diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ab8b75f415870..484b947bda402 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -27,7 +27,6 @@
 /llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @nikic
 /llvm/lib/Transforms/InstCombine/ @nikic
 
-/clang/include/clang/Sema/Sema.h @Endilll
 /clang/test/CXX/drs/ @Endilll
 /clang/www/cxx_dr_status.html @Endilll
 /clang/www/make_cxx_dr_status @Endilll
diff --git a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
index dc3a3b6211b7e..80ee31368fe9a 100644
--- a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
+++ b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
@@ -63,7 +63,9 @@ getNewFieldsOrder(const RecordDecl *Definition,
     NameToIndex[Field->getName()] = Field->getFieldIndex();
 
   if (DesiredFieldsOrder.size() != NameToIndex.size()) {
-    llvm::errs() << "Number of provided fields doesn't match definition.\n";
+    llvm::errs() << "Number of provided fields (" << DesiredFieldsOrder.size()
+                 << ") doesn't match definition (" << NameToIndex.size()
+                 << ").\n";
     return {};
   }
   SmallVector<unsigned, 4> NewFieldsOrder;
@@ -116,26 +118,77 @@ findMembersUsedInInitExpr(const CXXCtorInitializer *Initializer,
   return Results;
 }
 
-/// Returns the full source range for the field declaration up to (not
-/// including) the trailing semicolumn, including potential macro invocations,
-/// e.g. `int a GUARDED_BY(mu);`.
+/// Returns the next token after `Loc` (including comment tokens).
+static std::optional<Token> getTokenAfter(SourceLocation Loc,
+                                          const SourceManager &SM,
+                                          const LangOptions &LangOpts) {
+  if (Loc.isMacroID()) {
+    return std::nullopt;
+  }
+  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
+
+  // Break down the source location.
+  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
+
+  // Try to load the file buffer.
+  bool InvalidTemp = false;
+  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
+  if (InvalidTemp)
+    return std::nullopt;
+
+  const char *TokenBegin = File.data() + LocInfo.second;
+
+  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
+              TokenBegin, File.end());
+  lexer.SetCommentRetentionState(true);
+  // Find the token.
+  Token Tok;
+  lexer.LexFromRawLexer(Tok);
+  return Tok;
+}
+
+/// Returns the end of the trailing comments after `Loc`.
+static SourceLocation getEndOfTrailingComment(SourceLocation Loc,
+                                              const SourceManager &SM,
+                                              const LangOptions &LangOpts) {
+  // We consider any following comment token that is indented more than the
+  // first comment to be part of the trailing comment.
+  const unsigned Column = SM.getPresumedColumnNumber(Loc);
+  std::optional<Token> Tok = getTokenAfter(Loc, SM, LangOpts);
+  while (Tok && Tok->is(tok::comment) &&
+         SM.getPresumedColumnNumber(Tok->getLocation()) > Column) {
+    Loc = Tok->getEndLoc();
+    Tok = getTokenAfter(Loc, SM, LangOpts);
+  }
+  return Loc;
+}
+
+/// Returns the full source range for the field declaration up to (and
+/// including) the trailing semicolon, including potential macro invocations,
+/// e.g. `int a GUARDED_BY(mu);`. If there is a trailing comment, include it.
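+/// Multi-line trailing comments are included as well; see
+/// getEndOfTrailingComment above.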
 static SourceRange getFullFieldSourceRange(const FieldDecl &Field,
                                            const ASTContext &Context) {
-  SourceRange Range = Field.getSourceRange();
+  const SourceRange Range = Field.getSourceRange();
+  SourceLocation Begin = Range.getBegin();
   SourceLocation End = Range.getEnd();
   const SourceManager &SM = Context.getSourceManager();
   const LangOptions &LangOpts = Context.getLangOpts();
   while (true) {
     std::optional<Token> CurrentToken = Lexer::findNextToken(End, SM, LangOpts);
 
-    if (!CurrentToken || CurrentToken->is(tok::semi))
-      break;
+    if (!CurrentToken)
+      return SourceRange(Begin, End);
 
     if (CurrentToken->is(tok::eof))
       return Range; // Something is wrong, return the original range.
+
+    End = CurrentToken->getLastLoc();
+
+    if (CurrentToken->is(tok::semi))
+      break;
   }
-  return SourceRange(Range.getBegin(), End);
+  End = getEndOfTrailingComment(End, SM, LangOpts);
+  return SourceRange(Begin, End);
 }
 
 /// Reorders fields in the definition of a struct/class.
diff --git a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py
index 62cb4297c50f7..33de2077dfb1a 100755
--- a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py
+++ b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py
@@ -35,6 +35,7 @@
 import tempfile
 import threading
 import traceback
+from pathlib import Path
 
 try:
     import yaml
@@ -124,6 +125,23 @@ def merge_replacement_files(tmpdir, mergefile):
     open(mergefile, "w").close()
 
 
+def get_compiling_files(args):
+    """Read a compile_commands.json database and return a set of file paths."""
+    current_dir = Path.cwd()
+    compile_commands_json = (
+        (current_dir / args.build_path) if args.build_path else current_dir
+    )
+    compile_commands_json = compile_commands_json / "compile_commands.json"
+    files = set()
+    with open(compile_commands_json) as db_file:
+        db_json = json.load(db_file)
+        for entry in db_json:
+            if "file" not in entry:
+                continue
+            files.add(Path(entry["file"]))
+    return files
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Run clang-tidy against changed files, and "
@@ -234,6 +252,13 @@ def main():
         action="store_true",
         help="Allow empty enabled checks.",
     )
+    parser.add_argument(
+        "-only-check-in-db",
+        dest="skip_non_compiling",
+        default=False,
+        action="store_true",
+        help="Only check files in the compilation database.",
+    )
 
     clang_tidy_args = []
     argv = sys.argv[1:]
@@ -243,11 +268,13 @@ def main():
 
     args = parser.parse_args(argv)
 
+    compiling_files = get_compiling_files(args) if args.skip_non_compiling else None
+
     # Extract changed lines for each file.
    filename = None
    lines_by_file = {}
    for line in sys.stdin:
-        match = re.search('^\\+\\+\\+\\ "?(.*?/){%s}([^ \t\n"]*)' % args.p, line)
+        match = re.search(r'^\+\+\+\ "?(.*?/){%s}([^ \t\n"]*)' % args.p, line)
         if match:
             filename = match.group(2)
         if filename is None:
@@ -260,6 +287,13 @@ def main():
             if not re.match("^%s$" % args.iregex, filename, re.IGNORECASE):
                 continue
 
+        # Skip any files not in the compilation database.
+        if (
+            compiling_files is not None
+            and (Path.cwd() / filename) not in compiling_files
+        ):
+            continue
+
         match = re.search(r"^@@.*\+(\d+)(,(\d+))?", line)
         if match:
             start_line = int(match.group(1))
diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp
index fb39b7b292242..a8182ce98ebe0 100644
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@@ -1867,14 +1867,41 @@ class CodeCompleteFlow {
     CodeCompleteResult Output;
 
     // Convert the results to final form, assembling the expensive strings.
+    // If necessary, search the index for documentation comments.
+    LookupRequest Req;
+    llvm::DenseMap<SymbolID, size_t> SymbolToCompletion;
     for (auto &C : Scored) {
       Output.Completions.push_back(toCodeCompletion(C.first));
       Output.Completions.back().Score = C.second;
       Output.Completions.back().CompletionTokenRange = ReplacedRange;
+      if (Opts.Index && !Output.Completions.back().Documentation) {
+        for (auto &Cand : C.first) {
+          if (Cand.SemaResult &&
+              Cand.SemaResult->Kind == CodeCompletionResult::RK_Declaration) {
+            auto ID = clangd::getSymbolID(Cand.SemaResult->getDeclaration());
+            if (!ID)
+              continue;
+            Req.IDs.insert(ID);
+            SymbolToCompletion[ID] = Output.Completions.size() - 1;
+          }
+        }
+      }
     }
     Output.HasMore = Incomplete;
     Output.Context = CCContextKind;
     Output.CompletionRange = ReplacedRange;
+
+    // Look up documentation from the index.
+    if (Opts.Index) {
+      Opts.Index->lookup(Req, [&](const Symbol &S) {
+        if (S.Documentation.empty())
+          return;
+        auto &C = Output.Completions[SymbolToCompletion.at(S.ID)];
+        C.Documentation.emplace();
+        parseDocumentation(S.Documentation, *C.Documentation);
+      });
+    }
+
     return Output;
   }
diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
index 9d48a6e09fc77..b12f8275b8a26 100644
--- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
+++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
@@ -1136,6 +1136,87 @@ int x = foo^
       Contains(AllOf(named("foo"), doc("This comment should be retained!"))));
 }
 
+TEST(CompletionTest, CommentsOnMembersFromHeader) {
+  MockFS FS;
+  MockCompilationDatabase CDB;
+
+  auto Opts = ClangdServer::optsForTest();
+  Opts.BuildDynamicSymbolIndex = true;
+
+  ClangdServer Server(CDB, FS, Opts);
+
+  FS.Files[testPath("foo.h")] = R"cpp(
+    struct alpha {
+      /// This is a member field.
+      int gamma;
+
+      /// This is a member function.
+      int delta();
+    };
+  )cpp";
+
+  auto File = testPath("foo.cpp");
+  Annotations Test(R"cpp(
+#include "foo.h"
+alpha a;
+int x = a.^
+  )cpp");
+  runAddDocument(Server, File, Test.code());
+  auto CompletionList =
+      llvm::cantFail(runCodeComplete(Server, File, Test.point(), {}));
+
+  EXPECT_THAT(CompletionList.Completions,
+              Contains(AllOf(named("gamma"), doc("This is a member field."))));
+  EXPECT_THAT(
+      CompletionList.Completions,
+      Contains(AllOf(named("delta"), doc("This is a member function."))));
+}
+
+TEST(CompletionTest, CommentsOnMembersFromHeaderOverloadBundling) {
+  using testing::AnyOf;
+  MockFS FS;
+  MockCompilationDatabase CDB;
+
+  auto Opts = ClangdServer::optsForTest();
+  Opts.BuildDynamicSymbolIndex = true;
+
+  ClangdServer Server(CDB, FS, Opts);
+
+  FS.Files[testPath("foo.h")] = R"cpp(
+    struct alpha {
+      /// bool overload.
+      int delta(bool b);
+
+      /// int overload.
+      int delta(int i);
+
+      void epsilon(long l);
+
+      /// This one has a comment.
+      void epsilon(int i);
+    };
+  )cpp";
+
+  auto File = testPath("foo.cpp");
+  Annotations Test(R"cpp(
+#include "foo.h"
+alpha a;
+int x = a.^
+  )cpp");
+  runAddDocument(Server, File, Test.code());
+  clangd::CodeCompleteOptions CCOpts;
+  CCOpts.BundleOverloads = true;
+  auto CompletionList =
+      llvm::cantFail(runCodeComplete(Server, File, Test.point(), CCOpts));
+
+  EXPECT_THAT(
+      CompletionList.Completions,
+      Contains(AllOf(named("epsilon"), doc("This one has a comment."))));
+  EXPECT_THAT(CompletionList.Completions,
+              Contains(AllOf(named("delta"), AnyOf(doc("bool overload."),
+                                                   doc("int overload.")))));
+}
+
 TEST(CompletionTest, GlobalCompletionFiltering) {
 
   Symbol Class = cls("XYZ");
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 6350022ed9a8d..8ba47dfc84f26 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -108,6 +108,10 @@ Improvements to clang-query
 Improvements to clang-tidy
 --------------------------
 
+- Improved :program:`clang-tidy-diff.py` script. Added the `-only-check-in-db`
+  option to exclude files not present in the compilation database, avoiding
+  false-negative results.
+
 - Improved :program:`run-clang-tidy.py` script. Fixed minor shutdown noise
   happening on certain platforms when interrupting the script.
 
diff --git a/clang-tools-extra/test/clang-reorder-fields/Comments.cpp b/clang-tools-extra/test/clang-reorder-fields/Comments.cpp
new file mode 100644
index 0000000000000..a31b6692c9ac7
--- /dev/null
+++ b/clang-tools-extra/test/clang-reorder-fields/Comments.cpp
@@ -0,0 +1,23 @@
+// RUN: clang-reorder-fields -record-name Foo -fields-order e1,e3,e2,a,c,b %s -- | FileCheck %s
+
+class Foo {
+  int a; // Trailing comment for a.
+  int b; // Multiline
+         // trailing for b.
+  // Prefix comments for c.
+  int c;
+
+  /*c-like*/ int e1;
+  int /*c-like*/ e2;
+  int e3 /*c-like*/;
+};
+
+// CHECK:      /*c-like*/ int e1;
+// CHECK-NEXT: int e3 /*c-like*/;
+// CHECK-NEXT: int /*c-like*/ e2;
+// CHECK-NEXT: int a; // Trailing comment for a.
+// CHECK-NEXT: // Prefix comments for c.
+// CHECK-NEXT: int c;
+// CHECK-NEXT: int b; // Multiline
+// CHECK-NEXT: // trailing for b.
+
diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index 710259de855f9..806e1b40f3c9e 100644
--- a/clang/bindings/python/clang/cindex.py
+++ b/clang/bindings/python/clang/cindex.py
@@ -2133,6 +2133,14 @@ def get_field_offsetof(self):
         """Returns the offsetof the FIELD_DECL pointed by this Cursor."""
         return conf.lib.clang_Cursor_getOffsetOfField(self)  # type: ignore [no-any-return]
 
+    def get_base_offsetof(self, parent):
+        """Returns the offset of the CXX_BASE_SPECIFIER pointed to by this Cursor."""
+        return conf.lib.clang_getOffsetOfBase(parent, self)  # type: ignore [no-any-return]
+
+    def is_virtual_base(self):
+        """Returns whether the CXX_BASE_SPECIFIER pointed to by this Cursor is virtual."""
+        return conf.lib.clang_isVirtualBase(self)  # type: ignore [no-any-return]
+
     def is_anonymous(self):
         """
         Check whether this is a record type without a name, or a field where
@@ -2687,6 +2695,21 @@ def visitor(field, children):
         conf.lib.clang_Type_visitFields(self, fields_visit_callback(visitor), fields)
         return iter(fields)
 
+    def get_bases(self):
+        """Return an iterator for accessing the base classes of this type."""
+
+        def visitor(base, children):
+            assert base != conf.lib.clang_getNullCursor()
+
+            # Create reference to TU so it isn't GC'd before Cursor.
+            base._tu = self._tu
+            bases.append(base)
+            return 1  # continue
+
+        bases: list[Cursor] = []
+        conf.lib.clang_visitCXXBaseClasses(self, fields_visit_callback(visitor), bases)
+        return iter(bases)
+
     def get_exception_specification_kind(self):
         """
         Return the kind of the exception specification; a value from
@@ -3940,6 +3963,7 @@ def set_property(self, property, value):
     ("clang_getNumDiagnosticsInSet", [c_object_p], c_uint),
     ("clang_getNumElements", [Type], c_longlong),
     ("clang_getNumOverloadedDecls", [Cursor], c_uint),
+    ("clang_getOffsetOfBase", [Cursor, Cursor], c_longlong),
     ("clang_getOverloadedDecl", [Cursor, c_uint], Cursor),
     ("clang_getPointeeType", [Type], Type),
     ("clang_getRange", [SourceLocation, SourceLocation], SourceRange),
@@ -3992,6 +4016,7 @@ def set_property(self, property, value):
         [TranslationUnit, SourceRange, POINTER(POINTER(Token)), POINTER(c_uint)],
     ),
     ("clang_visitChildren", [Cursor, cursor_visit_callback, py_object], c_uint),
+    ("clang_visitCXXBaseClasses", [Type, fields_visit_callback, py_object], c_uint),
     ("clang_Cursor_getNumArguments", [Cursor], c_int),
     ("clang_Cursor_getArgument", [Cursor, c_uint], Cursor),
     ("clang_Cursor_getNumTemplateArguments", [Cursor], c_int),
diff --git a/clang/bindings/python/tests/cindex/test_type.py b/clang/bindings/python/tests/cindex/test_type.py
index f39da8b5faf29..9bac33f3041f4 100644
--- a/clang/bindings/python/tests/cindex/test_type.py
+++ b/clang/bindings/python/tests/cindex/test_type.py
@@ -534,3 +534,28 @@ def test_pretty(self):
         self.assertEqual(f.type.get_canonical().pretty_printed(pp), "X")
         pp.set_property(PrintingPolicyProperty.SuppressTagKeyword, False)
         self.assertEqual(f.type.get_canonical().pretty_printed(pp), "struct X")
+
+    def test_base_classes(self):
+        source = """
+        class A { int a; };
+        class B { int b; };
+        class C { int c; };
+        template <typename T>
+        class Template : public A, public B, virtual C {
+        };
+        Template<int> instance;
+        int bar;
+        """
+        tu = get_tu(source, lang="cpp", flags=["--target=x86_64-linux-gnu"])
+        cursor = get_cursor(tu, "instance")
+        cursor_type = cursor.type
+        cursor_type_decl = cursor_type.get_declaration()
+        self.assertEqual(cursor.kind, CursorKind.VAR_DECL)
+        bases = list(cursor_type.get_bases())
+        self.assertEqual(len(bases), 3)
+        self.assertFalse(bases[0].is_virtual_base())
+        self.assertEqual(bases[0].get_base_offsetof(cursor_type_decl), 64)
+        self.assertFalse(bases[1].is_virtual_base())
+        self.assertEqual(bases[1].get_base_offsetof(cursor_type_decl), 96)
+        self.assertTrue(bases[2].is_virtual_base())
+        self.assertEqual(bases[2].get_base_offsetof(cursor_type_decl), 128)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 61aa955ca9b9d..c6bc95594f613 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -316,6 +316,8 @@ C++23 Feature Support
 C++20 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
+- Implemented module level lookup for C++20 modules. (#GH90154)
+
 Resolutions to C++ Defect Reports
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -1224,6 +1226,10 @@ libclang
   whether the first one comes strictly before the second in the source code.
 - Add ``clang_getTypePrettyPrinted``. It allows controlling the PrintingPolicy
   used to pretty-print a type.
+- Added ``clang_visitCXXBaseClasses``, which allows visiting the base classes
+  of a class.
+- Added ``clang_getOffsetOfBase``, which allows computing the offset of a base
+  class in a class's layout.
 
 Static Analyzer
 ---------------
@@ -1371,6 +1377,12 @@ Python Binding Changes
   declaration is an anonymous union or anonymous struct.
 - Added ``Type.pretty_printed``, a binding for ``clang_getTypePrettyPrinted``,
   which allows changing the formatting of pretty-printed types.
+- Added ``Cursor.is_virtual_base``, a binding for ``clang_isVirtualBase``,
+  which checks whether a base class is virtual.
+- Added ``Type.get_bases``, a binding for ``clang_visitCXXBaseClasses``, which
+  allows visiting the base classes of a class.
+- Added ``Cursor.get_base_offsetof``, a binding for ``clang_getOffsetOfBase``,
+  which allows computing the offset of a base class in a class's layout.
 
 OpenMP Support
 --------------
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index ad64497ceb802..aac5d1fa8aa2e 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -3605,8 +3605,8 @@ CINDEX_LINKAGE enum CXTypeNullabilityKind clang_Type_getNullability(CXType T);
 
 /**
  * List the possible error codes for \c clang_Type_getSizeOf,
- *   \c clang_Type_getAlignOf, \c clang_Type_getOffsetOf and
- *   \c clang_Cursor_getOffsetOf.
+ *   \c clang_Type_getAlignOf, \c clang_Type_getOffsetOf,
+ *   \c clang_Cursor_getOffsetOf, and \c clang_getOffsetOfBase.
 *
  * A value of this enumeration type can be returned if the target type is not
  * a valid argument to sizeof, alignof or offsetof.
@@ -3771,6 +3771,15 @@ CINDEX_LINKAGE enum CXRefQualifierKind clang_Type_getCXXRefQualifier(CXType T);
  */
 CINDEX_LINKAGE unsigned clang_isVirtualBase(CXCursor);
 
+/**
+ * Returns the offset in bits of a CX_CXXBaseSpecifier relative to the parent
+ * class.
+ *
+ * Returns a small negative number if the offset cannot be computed. See
+ * CXTypeLayoutError for error codes.
+ */
+CINDEX_LINKAGE long long clang_getOffsetOfBase(CXCursor Parent, CXCursor Base);
+
 /**
  * Represents the C++ access control level to a base class for a
  * cursor with kind CX_CXXBaseSpecifier.
@@ -6648,6 +6657,29 @@ typedef enum CXVisitorResult (*CXFieldVisitor)(CXCursor C,
 CINDEX_LINKAGE unsigned clang_Type_visitFields(CXType T, CXFieldVisitor visitor,
                                                CXClientData client_data);
 
+/**
+ * Visit the base classes of a type.
+ *
+ * This function visits all the direct base classes of the given type,
+ * invoking the given \p visitor function with the cursors of each
+ * visited base. The traversal may be ended prematurely if
+ * the visitor returns \c CXFieldVisit_Break.
+ *
+ * \param T the record type whose base classes may be visited.
+ *
+ * \param visitor the visitor function that will be invoked for each
+ * base class of \p T.
+ *
+ * \param client_data pointer data supplied by the client, which will
+ * be passed to the visitor each time it is invoked.
+ *
+ * \returns a non-zero value if the traversal was terminated
+ * prematurely by the visitor returning \c CXFieldVisit_Break.
+ */
+CINDEX_LINKAGE unsigned clang_visitCXXBaseClasses(CXType T,
+                                                  CXFieldVisitor visitor,
+                                                  CXClientData client_data);
+
 /**
  * Describes the kind of binary operators.
  */
diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index 71ab9178509b2..91177c9a4b51f 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -836,6 +836,10 @@ class alignas(8) Decl {
     return isFromASTFile() ? getImportedOwningModule() : getLocalOwningModule();
   }
 
+  /// Get the top level owning named module that owns this declaration if any.
+  /// \returns nullptr if the declaration is not owned by a named module.
+  Module *getTopLevelOwningNamedModule() const;
+
   /// Get the module that owns this declaration for linkage purposes.
   /// There only ever is such a standard C++ module.
   Module *getOwningModuleForLinkage() const;
@@ -2722,6 +2726,12 @@ class DeclContext {
                                     bool Deserialize = false) const;
 
 private:
+  /// Look up all externally visible declarations, including declarations
+  /// local to the module specified by \c NamedModule. We can't derive
+  /// \c NamedModule from \c this since the same declaration context
+  /// (e.g., a namespace) may be declared in multiple modules.
+  lookup_result lookupImpl(DeclarationName Name, Module *NamedModule) const;
+
   /// Whether this declaration context has had externally visible
   /// storage added since the last lookup. In this case, \c LookupPtr's
   /// invariant may not hold and needs to be fixed before we perform
diff --git a/clang/include/clang/AST/ExternalASTMerger.h b/clang/include/clang/AST/ExternalASTMerger.h
index ec4cfbe2175c0..46f187c5e0694 100644
--- a/clang/include/clang/AST/ExternalASTMerger.h
+++ b/clang/include/clang/AST/ExternalASTMerger.h
@@ -141,7 +141,8 @@ class ExternalASTMerger : public ExternalASTSource {
 
   /// Implementation of the ExternalASTSource API.
   bool FindExternalVisibleDeclsByName(const DeclContext *DC,
-                                      DeclarationName Name) override;
+                                      DeclarationName Name,
+                                      Module *NamedModule) override;
 
   /// Implementation of the ExternalASTSource API.
   void
diff --git a/clang/include/clang/AST/ExternalASTSource.h b/clang/include/clang/AST/ExternalASTSource.h
index 4d7ff822fceb7..ee4ad634977dc 100644
--- a/clang/include/clang/AST/ExternalASTSource.h
+++ b/clang/include/clang/AST/ExternalASTSource.h
@@ -51,6 +51,7 @@ class RecordDecl;
 class Selector;
 class Stmt;
 class TagDecl;
+class Module;
 
 /// Abstract interface for external sources of AST nodes.
 ///
@@ -145,12 +146,20 @@ class ExternalASTSource : public RefCountedBase<ExternalASTSource> {
   /// Find all declarations with the given name in the given context,
   /// and add them to the context by calling SetExternalVisibleDeclsForName
   /// or SetNoExternalVisibleDeclsForName.
-  /// \return \c true if any declarations might have been found, \c false if
-  /// we definitely have no declarations with tbis name.
+  /// \param DC the context for lookup.
+  /// \param Name the name of the declarations to find.
+  /// \param NamedModule find declarations visible to the given module
+  /// \c NamedModule. This may differ from the owning module of \c DC, since
+  /// some declarations (e.g., namespaces) can appear in multiple modules.
+  ///
+  /// \return \c true if any declarations might have been found, and \c false
+  /// if we definitely have no declarations with this name.
   ///
   /// The default implementation of this method is a no-op returning \c false.
-  virtual bool
-  FindExternalVisibleDeclsByName(const DeclContext *DC, DeclarationName Name);
+  virtual bool FindExternalVisibleDeclsByName(const DeclContext *DC,
+                                              DeclarationName Name,
+                                              Module *NamedModule);
 
   /// Load all the external specializations for the Decl \param D if \param
   /// OnlyPartial is false. Otherwise, load all the external **partial**
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 78677df578c4b..f0fbacccc97bb 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -7040,17 +7040,17 @@ class DependentNameType : public TypeWithKeyword, public llvm::FoldingSetNode {
       : TypeWithKeyword(Keyword, DependentName, CanonType,
                         TypeDependence::DependentInstantiation |
                             toTypeDependence(NNS->getDependence())),
-        NNS(NNS), Name(Name) {}
+        NNS(NNS), Name(Name) {
+    assert(NNS);
+    assert(Name);
+  }
 
 public:
   /// Retrieve the qualification on this type.
   NestedNameSpecifier *getQualifier() const { return NNS; }
 
-  /// Retrieve the type named by the typename specifier as an identifier.
-  ///
-  /// This routine will return a non-NULL identifier pointer when the
-  /// form of the original typename was terminated by an identifier,
-  /// e.g., "typename T::type".
+  /// Retrieve the identifier that terminates this type name.
+  /// For example, "type" in "typename T::type".
   const IdentifierInfo *getIdentifier() const {
     return Name;
   }
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index 80bce574a3b64..e6d1e1f888f25 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -72,6 +72,29 @@ enum ModuleHeaderMode {
   HeaderMode_System
 };
 
+/// Options for specifying CUID used by CUDA/HIP for uniquely identifying
+/// compilation units.
+class CUIDOptions {
+public:
+  enum class Kind { Hash, Random, Fixed, None, Invalid };
+
+  CUIDOptions() = default;
+  CUIDOptions(const CUIDOptions &) = default;
+  CUIDOptions(llvm::opt::DerivedArgList &Args, const Driver &D);
+
+  // Get the CUID for an input string.
+  std::string getCUID(StringRef InputFile,
+                      llvm::opt::DerivedArgList &Args) const;
+
+  bool isEnabled() const {
+    return UseCUID != Kind::None && UseCUID != Kind::Invalid;
+  }
+
+private:
+  Kind UseCUID = Kind::None;
+  StringRef FixedCUID;
+};
+
 /// Driver - Encapsulate logic for constructing compilation processes
 /// from a set of gcc-driver-like command line arguments.
 class Driver {
@@ -119,6 +142,9 @@ class Driver {
   /// LTO mode selected via -f(no-offload-)?lto(=.*)? options.
   LTOKind OffloadLTOMode;
 
+  /// Options for CUID.
+  CUIDOptions CUIDOpts;
+
 public:
   enum OpenMPRuntimeKind {
     /// An unknown OpenMP runtime. We can't generate effective OpenMP code
@@ -501,10 +527,11 @@ class Driver {
   /// \param C - The compilation that is being built.
   /// \param Args - The input arguments.
   /// \param Input - The input type and arguments
+  /// \param CUID - The CUID for \p Input
   /// \param HostAction - The host action used in the offloading toolchain.
   Action *BuildOffloadingActions(Compilation &C,
                                  llvm::opt::DerivedArgList &Args,
-                                 const InputTy &Input,
+                                 const InputTy &Input, StringRef CUID,
                                  Action *HostAction) const;
 
   /// Returns the set of bound architectures active for this offload kind.
@@ -728,6 +755,9 @@ class Driver {
   /// Get the specific kind of offload LTO being performed.
   LTOKind getOffloadLTOMode() const { return OffloadLTOMode; }
 
+  /// Get the CUID option.
+  const CUIDOptions &getCUIDOpts() const { return CUIDOpts; }
+
 private:
 
   /// Tries to load options from configuration files.
diff --git a/clang/include/clang/Driver/Multilib.h b/clang/include/clang/Driver/Multilib.h
index dbed70f4f9008..0a533ed2804e2 100644
--- a/clang/include/clang/Driver/Multilib.h
+++ b/clang/include/clang/Driver/Multilib.h
@@ -101,6 +101,30 @@ class Multilib {
 
 raw_ostream &operator<<(raw_ostream &OS, const Multilib &M);
 
+namespace custom_flag {
+struct Declaration;
+
+struct ValueDetail {
+  std::string Name;
+  std::optional<SmallVector<std::string>> MacroDefines;
+  Declaration *Decl;
+};
+
+struct Declaration {
+  std::string Name;
+  SmallVector<ValueDetail> ValueList;
+  std::optional<size_t> DefaultValueIdx;
+
+  Declaration() = default;
+  Declaration(const Declaration &);
+  Declaration(Declaration &&);
+  Declaration &operator=(const Declaration &);
+  Declaration &operator=(Declaration &&);
+};
+
+static constexpr StringRef Prefix = "-fmultilib-flag=";
+} // namespace custom_flag
+
 /// See also MultilibSetBuilder for combining multilibs into a set.
 class MultilibSet {
 public:
@@ -120,15 +144,18 @@ class MultilibSet {
 
 private:
   multilib_list Multilibs;
-  std::vector<FlagMatcher> FlagMatchers;
+  SmallVector<FlagMatcher> FlagMatchers;
+  SmallVector<custom_flag::Declaration> CustomFlagDecls;
   IncludeDirsFunc IncludeCallback;
   IncludeDirsFunc FilePathsCallback;
 
public:
   MultilibSet() = default;
   MultilibSet(multilib_list &&Multilibs,
-              std::vector<FlagMatcher> &&FlagMatchers = {})
-      : Multilibs(Multilibs), FlagMatchers(FlagMatchers) {}
+              SmallVector<FlagMatcher> &&FlagMatchers = {},
+              SmallVector<custom_flag::Declaration> &&CustomFlagDecls = {})
+      : Multilibs(std::move(Multilibs)), FlagMatchers(std::move(FlagMatchers)),
+        CustomFlagDecls(std::move(CustomFlagDecls)) {}
 
   const multilib_list &getMultilibs() { return Multilibs; }
 
diff --git a/clang/include/clang/Sema/MultiplexExternalSemaSource.h b/clang/include/clang/Sema/MultiplexExternalSemaSource.h
index 0c92c52854c9e..08d6143f7caaf 100644
--- a/clang/include/clang/Sema/MultiplexExternalSemaSource.h
+++ b/clang/include/clang/Sema/MultiplexExternalSemaSource.h
@@ -95,7 +95,8 @@ class MultiplexExternalSemaSource : public ExternalSemaSource {
   /// Find all declarations with the given name in the
   /// given context.
   bool FindExternalVisibleDeclsByName(const DeclContext *DC,
-                                      DeclarationName Name) override;
+                                      DeclarationName Name,
+                                      Module *NamedModule) override;
 
   bool LoadExternalSpecializations(const Decl *D, bool OnlyPartial) override;
 
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index aac165130b719..40dae25f7b54b 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -738,6 +738,8 @@ enum ASTRecordTypes {
   CXX_ADDED_TEMPLATE_SPECIALIZATION = 74,
 
   CXX_ADDED_TEMPLATE_PARTIAL_SPECIALIZATION = 75,
+
+  UPDATE_MODULE_LOCAL_VISIBLE = 76,
 };
 
 /// Record types used within a source manager block.
@@ -1334,6 +1336,10 @@ enum DeclCode {
   /// into a DeclContext via DeclContext::lookup.
   DECL_CONTEXT_VISIBLE,
 
+  /// A record containing the set of declarations that are
+  /// only visible from the DeclContext within the same module.
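+  /// Used to implement module-level lookup for C++20 named modules
+  /// (#GH90154).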
+  DECL_CONTEXT_MODULE_LOCAL_VISIBLE,
+
   /// A LabelDecl record.
   DECL_LABEL,
 
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index 9f978762a6fb6..ea12adaec3ee8 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -353,6 +353,7 @@ class ASTIdentifierLookupTrait;
 
 /// The on-disk hash table(s) used for DeclContext name lookup.
 struct DeclContextLookupTable;
+struct ModuleLocalLookupTable;
 
 /// The on-disk hash table(s) used for specialization decls.
 struct LazySpecializationInfoLookupTable;
@@ -523,9 +524,14 @@ class ASTReader
   /// in the chain.
   DeclUpdateOffsetsMap DeclUpdateOffsets;
 
+  struct LookupBlockOffsets {
+    uint64_t LexicalOffset;
+    uint64_t VisibleOffset;
+    uint64_t ModuleLocalOffset;
+  };
+
   using DelayedNamespaceOffsetMapTy =
-      llvm::DenseMap<GlobalDeclID, std::pair<uint64_t, uint64_t>>;
+      llvm::DenseMap<GlobalDeclID, LookupBlockOffsets>;
 
   /// Mapping from global declaration IDs to the lexical and visible block
   /// offset for delayed namespace in reduced BMI.
@@ -631,6 +637,9 @@ class ASTReader
   /// Map from a DeclContext to its lookup tables.
   llvm::DenseMap<const DeclContext *,
                  serialization::reader::DeclContextLookupTable> Lookups;
+  llvm::DenseMap<const DeclContext *,
+                 serialization::reader::ModuleLocalLookupTable>
+      ModuleLocalLookups;
 
   using SpecLookupTableTy =
       llvm::DenseMap<const Decl *,
                      serialization::reader::LazySpecializationInfoLookupTable>;
 
   llvm::DenseMap<GlobalDeclID, SmallVector<UpdateData>> PendingVisibleUpdates;
+  llvm::DenseMap<GlobalDeclID, SmallVector<UpdateData>>
+      PendingModuleLocalVisibleUpdates;
 
   using SpecializationsUpdate = SmallVector<UpdateData>;
   using SpecializationsUpdateMap =
       llvm::DenseMap<GlobalDeclID, SpecializationsUpdate>;
@@ -696,7 +707,8 @@ class ASTReader
   /// Read the record that describes the visible contents of a DC.
   bool ReadVisibleDeclContextStorage(ModuleFile &M,
                                      llvm::BitstreamCursor &Cursor,
-                                     uint64_t Offset, GlobalDeclID ID);
+                                     uint64_t Offset, GlobalDeclID ID,
+                                     bool IsModuleLocal);
 
   bool ReadSpecializations(ModuleFile &M, llvm::BitstreamCursor &Cursor,
                            uint64_t Offset, Decl *D, bool IsPartial);
@@ -1132,6 +1144,10 @@ class ASTReader
   /// Number of visible decl contexts read/total.
   unsigned NumVisibleDeclContextsRead = 0, TotalVisibleDeclContexts = 0;
 
+  /// Number of module local visible decl contexts read/total.
+  unsigned NumModuleLocalVisibleDeclContexts = 0,
+           TotalModuleLocalVisibleDeclContexts = 0;
+
   /// Total size of modules, in bits, currently loaded
   uint64_t TotalModulesSizeInBits = 0;
 
@@ -1444,6 +1460,9 @@ class ASTReader
   const serialization::reader::DeclContextLookupTable *
   getLoadedLookupTables(DeclContext *Primary) const;
 
+  const serialization::reader::ModuleLocalLookupTable *
+  getModuleLocalLookupTables(DeclContext *Primary) const;
+
   /// Get the loaded specializations lookup tables for \p D,
   /// if any.
   serialization::reader::LazySpecializationInfoLookupTable *
@@ -2119,7 +2138,8 @@ class ASTReader
   /// The current implementation of this method just loads the entire
   /// lookup table as unmaterialized references.
   bool FindExternalVisibleDeclsByName(const DeclContext *DC,
-                                      DeclarationName Name) override;
+                                      DeclarationName Name,
+                                      Module *NamedModule) override;
 
   /// Read all of the declarations lexically stored in a
   /// declaration context.
@@ -2607,6 +2627,10 @@ inline bool shouldSkipCheckingODR(const Decl *D) {
          (D->isFromGlobalModule() || D->isFromHeaderUnit());
 }
 
+/// Calculate a hash value for the primary module name of the given module.
+/// \returns std::nullopt if M is not a C++ standard module.
+std::optional<unsigned> getPrimaryModuleHash(const Module *M);
+
 } // namespace clang
 
 #endif // LLVM_CLANG_SERIALIZATION_ASTREADER_H
diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index adb7cce522a80..53b09cc914392 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -492,6 +492,10 @@ class ASTWriter : public ASTDeserializationListener,
   /// file.
   unsigned NumVisibleDeclContexts = 0;
 
+  /// The number of module local visible declcontexts written to the AST
+  /// file.
+  unsigned NumModuleLocalDeclContexts = 0;
+
   /// A mapping from each known submodule to its ID number, which will
   /// be a positive integer.
   llvm::DenseMap<Module *, unsigned> SubmoduleIDs;
@@ -587,11 +591,15 @@ class ASTWriter : public ASTDeserializationListener,
   uint64_t WriteSpecializationInfoLookupTable(
       const NamedDecl *D, llvm::SmallVectorImpl<const Decl *> &Specializations,
       bool IsPartial);
-  void GenerateNameLookupTable(ASTContext &Context, const DeclContext *DC,
-                               llvm::SmallVectorImpl<char> &LookupTable);
+  void
+  GenerateNameLookupTable(ASTContext &Context, const DeclContext *DC,
+                          llvm::SmallVectorImpl<char> &LookupTable,
+                          llvm::SmallVectorImpl<char> &ModuleLocalLookupTable);
   uint64_t WriteDeclContextLexicalBlock(ASTContext &Context,
                                         const DeclContext *DC);
-  uint64_t WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC);
+  void WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC,
+                                    uint64_t &VisibleBlockOffset,
+                                    uint64_t &ModuleLocalBlockOffset);
   void WriteTypeDeclOffsets();
   void WriteFileDeclIDsMap();
   void WriteComments(ASTContext &Context);
@@ -624,7 +632,9 @@ class ASTWriter : public ASTDeserializationListener,
   unsigned DeclParmVarAbbrev = 0;
   unsigned DeclContextLexicalAbbrev = 0;
   unsigned DeclContextVisibleLookupAbbrev = 0;
+  unsigned DeclModuleLocalVisibleLookupAbbrev = 0;
   unsigned UpdateVisibleAbbrev = 0;
+  unsigned ModuleLocalUpdateVisibleAbbrev = 0;
   unsigned DeclRecordAbbrev = 0;
   unsigned DeclTypedefAbbrev = 0;
   unsigned DeclVarAbbrev = 0;
diff --git a/clang/include/clang/Tooling/Tooling.h b/clang/include/clang/Tooling/Tooling.h
index 070706e8fa6d1..200fb30839a95 100644
--- a/clang/include/clang/Tooling/Tooling.h
+++ b/clang/include/clang/Tooling/Tooling.h
@@ -223,7 +223,11 @@ buildASTFromCode(StringRef Code, StringRef FileName = "input.cc",
 /// \param PCHContainerOps The PCHContainerOperations for loading and creating
 /// clang modules.
 ///
-/// \param Adjuster A function to filter the command line arguments as specified.
+/// \param Adjuster A function to filter the command line arguments as
+/// specified.
+///
+/// \param BaseFS FileSystem for managing and looking up files.
+/// VirtualMappedFiles takes precedence.
 ///
 /// \return The resulting AST or null if an error occurred.
 std::unique_ptr<ASTUnit> buildASTFromCodeWithArgs(
@@ -233,7 +237,9 @@ std::unique_ptr<ASTUnit> buildASTFromCodeWithArgs(
         std::make_shared<PCHContainerOperations>(),
     ArgumentsAdjuster Adjuster = getClangStripDependencyFileAdjuster(),
     const FileContentMappings &VirtualMappedFiles = FileContentMappings(),
-    DiagnosticConsumer *DiagConsumer = nullptr);
+    DiagnosticConsumer *DiagConsumer = nullptr,
+    IntrusiveRefCntPtr<llvm::vfs::FileSystem> BaseFS =
+        llvm::vfs::getRealFileSystem());
 
 /// Utility to run a FrontendAction in a single clang invocation.
 class ToolInvocation {
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index dec4c7221bc77..0669aa1b809c3 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -6376,7 +6376,7 @@ ExpectedDecl ASTNodeImporter::VisitClassTemplateSpecializationDecl(
   D2->setTemplateSpecializationKind(D->getTemplateSpecializationKind());
 
   if (auto P = D->getInstantiatedFrom()) {
-    if (auto *CTD = P.dyn_cast<ClassTemplateDecl *>()) {
+    if (auto *CTD = dyn_cast<ClassTemplateDecl *>(P)) {
       if (auto CTDorErr = import(CTD))
         D2->setInstantiationOf(*CTDorErr);
     } else {
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index a5dfaaf319655..4bfb80589620c 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -2141,6 +2141,16 @@ bool Compiler<Emitter>::VisitUnaryExprOrTypeTraitExpr(
     return this->emitConst(ASTCtx.toCharUnitsFromBits(Bits).getQuantity(), E);
   }
 
+  if (Kind == UETT_PtrAuthTypeDiscriminator) {
+    if (E->getArgumentType()->isDependentType())
+      return this->emitInvalid(E);
+
+    return this->emitConst(
+        const_cast<ASTContext &>(ASTCtx).getPointerAuthTypeDiscriminator(
+            E->getArgumentType()),
+        E);
+  }
+
   return false;
 }
 
@@ -4805,12 +4815,7 @@
 template <class Emitter>
 bool Compiler<Emitter>::VisitCXXDefaultArgExpr(const CXXDefaultArgExpr *E) {
   SourceLocScope<Emitter> SLS(this, E);
 
-  const Expr *SubExpr = E->getExpr();
-  if (std::optional<PrimType> T = classify(E->getExpr()))
-    return this->visit(SubExpr);
-
-  assert(Initializing);
-  return this->visitInitializer(SubExpr);
+  return this->delegate(E->getExpr());
 }
 
 template <class Emitter>
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
index 496c1dcef59b5..1aba778eaf7b9 100644
--- a/clang/lib/AST/ByteCode/Disasm.cpp
+++ b/clang/lib/AST/ByteCode/Disasm.cpp
@@ -368,10 +368,10 @@ LLVM_DUMP_METHOD void EvaluationResult::dump() const {
   case LValue: {
     assert(Source);
     QualType SourceType;
-    if (const auto *D = Source.dyn_cast<const Decl *>()) {
+    if (const auto *D = dyn_cast<const Decl *>(Source)) {
       if (const auto *VD = dyn_cast<ValueDecl>(D))
         SourceType = VD->getType();
-    } else if (const auto *E = Source.dyn_cast<const Expr *>()) {
+    } else if (const auto *E = dyn_cast<const Expr *>(Source)) {
       SourceType = E->getType();
     }
 
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index fb701f76231bc..42daaa4f3dcc3 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -1850,15 +1850,28 @@ void DeclContext::buildLookupImpl(DeclContext *DCtx, bool Internal) {
   }
 }
 
+Module *Decl::getTopLevelOwningNamedModule() const {
+  if (getOwningModule() &&
+      getOwningModule()->getTopLevelModule()->isNamedModule())
+    return getOwningModule()->getTopLevelModule();
+
+  return nullptr;
+}
+
 DeclContext::lookup_result
 DeclContext::lookup(DeclarationName Name) const {
+  return lookupImpl(Name, cast<Decl>(this)->getTopLevelOwningNamedModule());
+}
+
+DeclContext::lookup_result DeclContext::lookupImpl(DeclarationName Name,
+                                                   Module *NamedModule) const {
if (getDeclKind() == Decl::LinkageSpec || getDeclKind() == Decl::Export) - return getParent()->lookup(Name); + return getParent()->lookupImpl(Name, NamedModule); const DeclContext *PrimaryContext = getPrimaryContext(); if (PrimaryContext != this) - return PrimaryContext->lookup(Name); + return PrimaryContext->lookupImpl(Name, NamedModule); // If we have an external source, ensure that any later redeclarations of this // context have been loaded, since they may add names to the result of this @@ -1889,7 +1902,8 @@ DeclContext::lookup(DeclarationName Name) const { if (!R.second && !R.first->second.hasExternalDecls()) return R.first->second.getLookupResult(); - if (Source->FindExternalVisibleDeclsByName(this, Name) || !R.second) { + if (Source->FindExternalVisibleDeclsByName(this, Name, NamedModule) || + !R.second) { if (StoredDeclsMap *Map = LookupPtr) { StoredDeclsMap::iterator I = Map->find(Name); if (I != Map->end()) @@ -2115,7 +2129,8 @@ void DeclContext::makeDeclVisibleInContextImpl(NamedDecl *D, bool Internal) { if (ExternalASTSource *Source = getParentASTContext().getExternalSource()) if (hasExternalVisibleStorage() && Map->find(D->getDeclName()) == Map->end()) - Source->FindExternalVisibleDeclsByName(this, D->getDeclName()); + Source->FindExternalVisibleDeclsByName( + this, D->getDeclName(), D->getTopLevelOwningNamedModule()); // Insert this declaration into the map. StoredDeclsList &DeclNameEntries = (*Map)[D->getDeclName()]; diff --git a/clang/lib/AST/ExternalASTMerger.cpp b/clang/lib/AST/ExternalASTMerger.cpp index 8bad3b36244e1..a33f6e3447679 100644 --- a/clang/lib/AST/ExternalASTMerger.cpp +++ b/clang/lib/AST/ExternalASTMerger.cpp @@ -276,8 +276,8 @@ bool ExternalASTMerger::HasImporterForOrigin(ASTContext &OriginContext) { template void ExternalASTMerger::ForEachMatchingDC(const DeclContext *DC, CallbackType Callback) { - if (Origins.count(DC)) { - ExternalASTMerger::DCOrigin Origin = Origins[DC]; + if (auto It = Origins.find(DC); It != Origins.end()) { + ExternalASTMerger::DCOrigin Origin = It->second; LazyASTImporter &Importer = LazyImporterForOrigin(*this, *Origin.AST); Callback(Importer, Importer.GetReverse(), Origin.DC); } else { @@ -472,7 +472,8 @@ static bool importSpecializationsIfNeeded(Decl *D, ASTImporter *Importer) { } bool ExternalASTMerger::FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name) { + DeclarationName Name, + Module *NamedModule) { llvm::SmallVector Decls; llvm::SmallVector Candidates; diff --git a/clang/lib/AST/ExternalASTSource.cpp b/clang/lib/AST/ExternalASTSource.cpp index 543846c0093af..4a29f4944f73c 100644 --- a/clang/lib/AST/ExternalASTSource.cpp +++ b/clang/lib/AST/ExternalASTSource.cpp @@ -90,9 +90,9 @@ ExternalASTSource::GetExternalCXXBaseSpecifiers(uint64_t Offset) { return nullptr; } -bool -ExternalASTSource::FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name) { +bool ExternalASTSource::FindExternalVisibleDeclsByName(const DeclContext *DC, + DeclarationName Name, + Module *NamedModule) { return false; } diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 61ee26d886383..0fd5433a76402 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -617,7 +617,8 @@ bool ARMTargetInfo::handleTargetFeatures(std::vector &Features, case 6: if (ArchProfile == llvm::ARM::ProfileKind::M) LDREX = 0; - else if (ArchKind == llvm::ARM::ArchKind::ARMV6K) + else if (ArchKind == llvm::ARM::ArchKind::ARMV6K || + ArchKind == llvm::ARM::ArchKind::ARMV6KZ) 
       LDREX = LDREX_D | LDREX_W | LDREX_H | LDREX_B;
     else
       LDREX = LDREX_W;
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 79e6bf3d24dff..3951ad01497cc 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1032,7 +1032,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
       static_assert(SanitizerKind::SO_LocalBounds <=
                         std::numeric_limits<
                             decltype(Options.GuardKind)::value_type>::max(),
-                    "Update type of llvm.allow.ubsan.check.");
+                    "Update type of llvm.allow.ubsan.check to represent "
+                    "SanitizerKind::SO_LocalBounds.");
       Options.GuardKind = SanitizerKind::SO_LocalBounds;
     }
     Options.Merge =
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 244e3066f8fe4..ddcb04d53661d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -7768,7 +7768,7 @@ class MappableExprsHandler {
              &Data : RecordLayout) {
       if (Data.isNull())
         continue;
-      if (const auto *Base = Data.dyn_cast<const CXXRecordDecl *>())
+      if (const auto *Base = dyn_cast<const CXXRecordDecl *>(Data))
         getPlainLayout(Base, Layout, /*AsBase=*/true);
       else
         Layout.push_back(cast<const FieldDecl *>(Data));
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 9a947f32283c3..7767c81d654dc 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -197,6 +197,50 @@ std::string Driver::GetResourcesPath(StringRef BinaryPath) {
   return std::string(P);
 }
 
+CUIDOptions::CUIDOptions(llvm::opt::DerivedArgList &Args, const Driver &D)
+    : UseCUID(Kind::Hash) {
+  if (Arg *A = Args.getLastArg(options::OPT_fuse_cuid_EQ)) {
+    StringRef UseCUIDStr = A->getValue();
+    UseCUID = llvm::StringSwitch<Kind>(UseCUIDStr)
+                  .Case("hash", Kind::Hash)
+                  .Case("random", Kind::Random)
+                  .Case("none", Kind::None)
+                  .Default(Kind::Invalid);
+    if (UseCUID == Kind::Invalid)
+      D.Diag(clang::diag::err_drv_invalid_value)
+          << A->getAsString(Args) << UseCUIDStr;
+  }
+
+  FixedCUID = Args.getLastArgValue(options::OPT_cuid_EQ);
+  if (!FixedCUID.empty())
+    UseCUID = Kind::Fixed;
+}
+
+std::string CUIDOptions::getCUID(StringRef InputFile,
+                                 llvm::opt::DerivedArgList &Args) const {
+  std::string CUID = FixedCUID.str();
+  if (CUID.empty()) {
+    if (UseCUID == Kind::Random)
+      CUID = llvm::utohexstr(llvm::sys::Process::GetRandomNumber(),
+                             /*LowerCase=*/true);
+    else if (UseCUID == Kind::Hash) {
+      llvm::MD5 Hasher;
+      llvm::MD5::MD5Result Hash;
+      SmallString<256> RealPath;
+      llvm::sys::fs::real_path(InputFile, RealPath,
+                               /*expand_tilde=*/true);
+      Hasher.update(RealPath);
+      for (auto *A : Args) {
+        if (A->getOption().matches(options::OPT_INPUT))
+          continue;
+        Hasher.update(A->getAsString(Args));
+      }
+      Hasher.final(Hash);
+      CUID = llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
+    }
+  }
+  return CUID;
+}
+
 Driver::Driver(StringRef ClangExecutable, StringRef TargetTriple,
                DiagnosticsEngine &Diags, std::string Title,
                IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS)
@@ -875,6 +919,9 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
       C.addOffloadDeviceToolChain(HIPTC, OFK);
   }
 
+  if (IsCuda || IsHIP)
+    CUIDOpts = CUIDOptions(C.getArgs(), *this);
+
   //
   // OpenMP
   //
@@ -3161,19 +3208,15 @@ class OffloadingActionBuilder final {
     /// Default GPU architecture if there's no one specified.
     OffloadArch DefaultOffloadArch = OffloadArch::UNKNOWN;
 
-    /// Method to generate compilation unit ID specified by option
-    /// '-fuse-cuid='.
-    enum UseCUIDKind { CUID_Hash, CUID_Random, CUID_None, CUID_Invalid };
-    UseCUIDKind UseCUID = CUID_Hash;
-
-    /// Compilation unit ID specified by option '-cuid='.
-    StringRef FixedCUID;
+    /// Compilation unit ID specified by option '-fuse-cuid=' or '-cuid='.
+    const CUIDOptions &CUIDOpts;
 
   public:
     CudaActionBuilderBase(Compilation &C, DerivedArgList &Args,
                           const Driver::InputList &Inputs,
                           Action::OffloadKind OFKind)
-        : DeviceActionBuilder(C, Args, Inputs, OFKind) {
+        : DeviceActionBuilder(C, Args, Inputs, OFKind),
+          CUIDOpts(C.getDriver().getCUIDOpts()) {
 
       CompileDeviceOnly = C.getDriver().offloadDeviceOnly();
       Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
@@ -3204,28 +3247,8 @@ class OffloadingActionBuilder final {
         // Set the flag to true, so that the builder acts on the current input.
         IsActive = true;
 
-        std::string CUID = FixedCUID.str();
-        if (CUID.empty()) {
-          if (UseCUID == CUID_Random)
-            CUID = llvm::utohexstr(llvm::sys::Process::GetRandomNumber(),
-                                   /*LowerCase=*/true);
-          else if (UseCUID == CUID_Hash) {
-            llvm::MD5 Hasher;
-            llvm::MD5::MD5Result Hash;
-            SmallString<256> RealPath;
-            llvm::sys::fs::real_path(IA->getInputArg().getValue(), RealPath,
-                                     /*expand_tilde=*/true);
-            Hasher.update(RealPath);
-            for (auto *A : Args) {
-              if (A->getOption().matches(options::OPT_INPUT))
-                continue;
-              Hasher.update(A->getAsString(Args));
-            }
-            Hasher.final(Hash);
-            CUID = llvm::utohexstr(Hash.low(), /*LowerCase=*/true);
-          }
-        }
-        IA->setId(CUID);
+        if (CUIDOpts.isEnabled())
+          IA->setId(CUIDOpts.getCUID(IA->getInputArg().getValue(), Args));
 
         if (CompileHostOnly)
           return ABRT_Success;
@@ -3351,21 +3374,6 @@ class OffloadingActionBuilder final {
       CompileHostOnly = C.getDriver().offloadHostOnly();
       EmitLLVM = Args.getLastArg(options::OPT_emit_llvm);
       EmitAsm = Args.getLastArg(options::OPT_S);
-      FixedCUID = Args.getLastArgValue(options::OPT_cuid_EQ);
-      if (Arg *A = Args.getLastArg(options::OPT_fuse_cuid_EQ)) {
-        StringRef UseCUIDStr = A->getValue();
-        UseCUID = llvm::StringSwitch<UseCUIDKind>(UseCUIDStr)
-                      .Case("hash", CUID_Hash)
-                      .Case("random", CUID_Random)
-                      .Case("none", CUID_None)
-                      .Default(CUID_Invalid);
-        if (UseCUID == CUID_Invalid) {
-          C.getDriver().Diag(diag::err_drv_invalid_value)
-              << A->getAsString(Args) << UseCUIDStr;
-          C.setContainsError();
-          return true;
-        }
-      }
 
       // --offload and --offload-arch options are mutually exclusive.
       if (Args.hasArgNoClaim(options::OPT_offload_EQ) &&
@@ -4366,6 +4374,12 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     // Build the pipeline for this file.
     Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);
 
+    std::string CUID;
+    if (CUIDOpts.isEnabled() && types::isSrcFile(InputType)) {
+      CUID = CUIDOpts.getCUID(InputArg->getValue(), Args);
+      cast<InputAction>(Current)->setId(CUID);
+    }
+
     // Use the current host action in any of the offloading actions, if
     // required.
     if (!UseNewOffloadingDriver)
@@ -4429,7 +4443,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     // Try to build the offloading actions and add the result as a dependency
     // to the host.
     if (UseNewOffloadingDriver)
-      Current = BuildOffloadingActions(C, Args, I, Current);
+      Current = BuildOffloadingActions(C, Args, I, CUID, Current);
     // Use the current host action in any of the offloading actions, if
     // required.
     else if (OffloadBuilder->addHostDependenceToDeviceActions(Current,
@@ -4766,7 +4780,7 @@ Driver::getOffloadArchs(Compilation &C, const llvm::opt::DerivedArgList &Args,
 
 Action *Driver::BuildOffloadingActions(Compilation &C,
                                        llvm::opt::DerivedArgList &Args,
-                                       const InputTy &Input,
+                                       const InputTy &Input, StringRef CUID,
                                        Action *HostAction) const {
   // Don't build offloading actions if explicitly disabled or we do not have a
   // valid source input and compile action to embed it in. If preprocessing only
@@ -4807,13 +4821,13 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     llvm::DenseSet<StringRef> Arches = getOffloadArchs(C, Args, Kind, TC);
     SmallVector<StringRef> Sorted(Arches.begin(), Arches.end());
     llvm::sort(Sorted);
-    for (StringRef Arch : Sorted)
+    for (StringRef Arch : Sorted) {
       TCAndArchs.push_back(std::make_pair(TC, Arch));
+      DeviceActions.push_back(
+          C.MakeAction<InputAction>(*InputArg, InputType, CUID));
+    }
   }
 
-  for (unsigned I = 0, E = TCAndArchs.size(); I != E; ++I)
-    DeviceActions.push_back(C.MakeAction<InputAction>(*InputArg, InputType));
-
   if (DeviceActions.empty())
     return HostAction;
 
@@ -6501,9 +6515,24 @@ std::string Driver::GetStdModuleManifestPath(const Compilation &C,
       return evaluate("libc++.a").value_or(error);
     }
 
-  case ToolChain::CST_Libstdcxx:
-    // libstdc++ does not provide Standard library modules yet.
-    return error;
+  case ToolChain::CST_Libstdcxx: {
+    auto evaluate = [&](const char *library) -> std::optional<std::string> {
+      std::string lib = GetFilePath(library, TC);
+
+      SmallString<128> path(lib.begin(), lib.end());
+      llvm::sys::path::remove_filename(path);
+      llvm::sys::path::append(path, "libstdc++.modules.json");
+      if (TC.getVFS().exists(path))
+        return static_cast<std::string>(path);
+
+      return {};
+    };
+
+    if (std::optional<std::string> result = evaluate("libstdc++.so"); result)
+      return *result;
+
+    return evaluate("libstdc++.a").value_or(error);
+  }
   }
 
   return error;
diff --git a/clang/lib/Driver/Multilib.cpp b/clang/lib/Driver/Multilib.cpp
index 0207e0f2eb2de..ccf747e90cb2c 100644
--- a/clang/lib/Driver/Multilib.cpp
+++ b/clang/lib/Driver/Multilib.cpp
@@ -10,6 +10,7 @@
 #include "clang/Basic/LLVM.h"
 #include "clang/Driver/Driver.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -201,13 +202,20 @@ struct MultilibGroupSerialization {
 
 struct MultilibSetSerialization {
   llvm::VersionTuple MultilibVersion;
-  std::vector<MultilibGroupSerialization> Groups;
-  std::vector<MultilibSerialization> Multilibs;
-  std::vector<MultilibSet::FlagMatcher> FlagMatchers;
+  SmallVector<MultilibGroupSerialization> Groups;
+  SmallVector<MultilibSerialization> Multilibs;
+  SmallVector<MultilibSet::FlagMatcher> FlagMatchers;
+  SmallVector<custom_flag::Declaration> CustomFlagDeclarations;
 };
 
 } // end anonymous namespace
 
+LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization)
+LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization)
+LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher)
+LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::ValueDetail)
+LLVM_YAML_IS_SEQUENCE_VECTOR(custom_flag::Declaration)
+
 template <> struct llvm::yaml::MappingTraits<MultilibSerialization> {
   static void mapping(llvm::yaml::IO &io, MultilibSerialization &V) {
     io.mapOptional("Dir", V.Dir);
@@ -255,11 +263,61 @@ template <> struct llvm::yaml::MappingTraits<MultilibGroupSerialization> {
   }
 };
 
+template <>
+struct llvm::yaml::MappingContextTraits<custom_flag::ValueDetail,
+                                        llvm::SmallSet<std::string, 32>> {
+  static void mapping(llvm::yaml::IO &io, custom_flag::ValueDetail &V,
+                      llvm::SmallSet<std::string, 32> &) {
+    io.mapRequired("Name", V.Name);
+    io.mapOptional("MacroDefines", V.MacroDefines);
+  }
+  static std::string validate(IO &io, custom_flag::ValueDetail &V,
+                              llvm::SmallSet<std::string, 32> &NameSet) {
+    if (V.Name.empty())
+      return "custom flag value requires a name";
+    if (!NameSet.insert(V.Name).second)
+      return "duplicate custom flag value name: \"" + V.Name + "\"";
+    return {};
+  }
+};
+
+template <>
+struct llvm::yaml::MappingContextTraits<custom_flag::Declaration,
+                                        llvm::SmallSet<std::string, 32>> {
+  static void mapping(llvm::yaml::IO &io, custom_flag::Declaration &V,
+                      llvm::SmallSet<std::string, 32> &NameSet) {
+    io.mapRequired("Name", V.Name);
+    io.mapRequired("Values", V.ValueList, NameSet);
+    std::string DefaultValueName;
+    io.mapRequired("Default", DefaultValueName);
+
+    for (auto [Idx, Value] : llvm::enumerate(V.ValueList)) {
+      Value.Decl = &V;
+      if (Value.Name == DefaultValueName) {
+        assert(!V.DefaultValueIdx);
+        V.DefaultValueIdx = Idx;
+      }
+    }
+  }
+  static std::string validate(IO &io, custom_flag::Declaration &V,
+                              llvm::SmallSet<std::string, 32> &) {
+    if (V.Name.empty())
+      return "custom flag requires a name";
+    if (V.ValueList.empty())
+      return "custom flag must have at least one value";
+    if (!V.DefaultValueIdx)
+      return "custom flag must have a default value";
+    return {};
+  }
+};
+
 template <> struct llvm::yaml::MappingTraits<MultilibSetSerialization> {
   static void mapping(llvm::yaml::IO &io, MultilibSetSerialization &M) {
     io.mapRequired("MultilibVersion", M.MultilibVersion);
     io.mapRequired("Variants", M.Multilibs);
     io.mapOptional("Groups", M.Groups);
+    llvm::SmallSet<std::string, 32> NameSet;
+    io.mapOptionalWithContext("Flags", M.CustomFlagDeclarations, NameSet);
     io.mapOptional("Mappings", M.FlagMatchers);
   }
   static std::string validate(IO &io, MultilibSetSerialization &M) {
@@ -288,10 +346,6 @@ template <> struct llvm::yaml::MappingTraits<MultilibSetSerialization> {
   }
 };
 
-LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSerialization)
-LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibGroupSerialization)
-LLVM_YAML_IS_SEQUENCE_VECTOR(MultilibSet::FlagMatcher)
-
 llvm::ErrorOr<MultilibSet>
 MultilibSet::parseYaml(llvm::MemoryBufferRef Input,
                        llvm::SourceMgr::DiagHandlerTy DiagHandler,
@@ -319,7 +373,8 @@ MultilibSet::parseYaml(llvm::MemoryBufferRef Input,
     }
   }
 
-  return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers));
+  return MultilibSet(std::move(Multilibs), std::move(MS.FlagMatchers),
+                     std::move(MS.CustomFlagDeclarations));
 }
 
 LLVM_DUMP_METHOD void MultilibSet::dump() const {
@@ -335,3 +390,41 @@ raw_ostream &clang::driver::operator<<(raw_ostream &OS, const MultilibSet &MS) {
   MS.print(OS);
   return OS;
 }
+
+namespace clang::driver::custom_flag {
+Declaration::Declaration(const Declaration &Other)
+    : Name(Other.Name), ValueList(Other.ValueList),
+      DefaultValueIdx(Other.DefaultValueIdx) {
+  for (ValueDetail &Detail : ValueList)
+    Detail.Decl = this;
+}
+
+Declaration::Declaration(Declaration &&Other)
+    : Name(std::move(Other.Name)), ValueList(std::move(Other.ValueList)),
+      DefaultValueIdx(std::move(Other.DefaultValueIdx)) {
+  for (ValueDetail &Detail : ValueList)
+    Detail.Decl = this;
+}
+
+Declaration &Declaration::operator=(const Declaration &Other) {
+  if (this == &Other)
+    return *this;
+  Name = Other.Name;
+  ValueList = Other.ValueList;
+  DefaultValueIdx = Other.DefaultValueIdx;
+  for (ValueDetail &Detail : ValueList)
+    Detail.Decl = this;
+  return *this;
+}
+
+Declaration &Declaration::operator=(Declaration &&Other) {
+  if (this == &Other)
+    return *this;
+  Name = std::move(Other.Name);
+  ValueList = std::move(Other.ValueList);
+  DefaultValueIdx = std::move(Other.DefaultValueIdx);
+  for (ValueDetail &Detail : ValueList)
+    Detail.Decl = this;
+  return *this;
+}
+} // namespace clang::driver::custom_flag
diff --git a/clang/lib/Interpreter/CodeCompletion.cpp b/clang/lib/Interpreter/CodeCompletion.cpp
index bbc8830d76bc0..9092d4705ca58 100644
--- a/clang/lib/Interpreter/CodeCompletion.cpp
+++ b/clang/lib/Interpreter/CodeCompletion.cpp
@@ -228,7 +228,8 @@ class ExternalSource : public clang::ExternalASTSource {
   ExternalSource(ASTContext &ChildASTCtxt, FileManager &ChildFM,
                  ASTContext &ParentASTCtxt, FileManager &ParentFM);
   bool FindExternalVisibleDeclsByName(const DeclContext *DC,
-                                      DeclarationName Name) override;
+                                      DeclarationName Name,
+                                      Module *NamedModule) override;
   void
   completeVisibleDeclsMap(const clang::DeclContext *childDeclContext) override;
 };
@@ -271,7 +272,8 @@ ExternalSource::ExternalSource(ASTContext &ChildASTCtxt, FileManager &ChildFM,
 }
 
 bool ExternalSource::FindExternalVisibleDeclsByName(const DeclContext *DC,
-                                                    DeclarationName Name) {
+                                                    DeclarationName Name,
+                                                    Module *NamedModule) {
 
   IdentifierTable &ParentIdTable = ParentASTCtxt.Idents;
 
diff --git a/clang/lib/Sema/MultiplexExternalSemaSource.cpp b/clang/lib/Sema/MultiplexExternalSemaSource.cpp
index 54944267b4868..c19a0f980c1e9 100644
--- a/clang/lib/Sema/MultiplexExternalSemaSource.cpp
+++ b/clang/lib/Sema/MultiplexExternalSemaSource.cpp
@@ -107,11 +107,12 @@ MultiplexExternalSemaSource::hasExternalDefinitions(const Decl *D) {
   return EK_ReplyHazy;
 }
 
-bool MultiplexExternalSemaSource::
-FindExternalVisibleDeclsByName(const DeclContext *DC, DeclarationName Name) {
+bool MultiplexExternalSemaSource::FindExternalVisibleDeclsByName(
+    const DeclContext *DC, DeclarationName Name, Module *NamedModule) {
   bool AnyDeclsFound = false;
   for (size_t i = 0; i < Sources.size(); ++i)
-    AnyDeclsFound |= Sources[i]->FindExternalVisibleDeclsByName(DC, Name);
+    AnyDeclsFound |=
+        Sources[i]->FindExternalVisibleDeclsByName(DC, Name, NamedModule);
   return AnyDeclsFound;
 }
 
diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp
index 1f398bb004fa3..8a848df70cc5a 100644
--- a/clang/lib/Sema/SemaCodeComplete.cpp
+++ b/clang/lib/Sema/SemaCodeComplete.cpp
@@ -121,8 +121,7 @@ class ResultBuilder {
         return;
       }
 
-      if (const NamedDecl *PrevND =
-              DeclOrVector.dyn_cast<const NamedDecl *>()) {
+      if (const NamedDecl *PrevND = dyn_cast<const NamedDecl *>(DeclOrVector)) {
         // 1 -> 2 elements: create the vector of results and push in the
         // existing declaration.
DeclIndexPairVector *Vec = new DeclIndexPairVector; @@ -702,7 +701,7 @@ ResultBuilder::ShadowMapEntry::begin() const { if (DeclOrVector.isNull()) return iterator(); - if (const NamedDecl *ND = DeclOrVector.dyn_cast()) + if (const NamedDecl *ND = dyn_cast(DeclOrVector)) return iterator(ND, SingleDeclIndex); return iterator(cast(DeclOrVector)->begin()); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 704cb82b291cc..e0dd6039810cb 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -17310,7 +17310,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, return nullptr; if (EnumUnderlying) { EnumDecl *ED = cast(New); - if (TypeSourceInfo *TI = EnumUnderlying.dyn_cast()) + if (TypeSourceInfo *TI = dyn_cast(EnumUnderlying)) ED->setIntegerTypeSourceInfo(TI); else ED->setIntegerType(QualType(cast(EnumUnderlying), 0)); @@ -17943,7 +17943,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, if (EnumUnderlying) { EnumDecl *ED = cast(New); - if (TypeSourceInfo *TI = EnumUnderlying.dyn_cast()) + if (TypeSourceInfo *TI = dyn_cast(EnumUnderlying)) ED->setIntegerTypeSourceInfo(TI); else ED->setIntegerType(QualType(cast(EnumUnderlying), 0)); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 7361cace49dd7..06853a227215e 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1235,7 +1235,7 @@ unsigned DeclarationNameKey::getHash() const { } ModuleFile * -ASTDeclContextNameLookupTrait::ReadFileRef(const unsigned char *&d) { +ASTDeclContextNameLookupTraitBase::ReadFileRef(const unsigned char *&d) { using namespace llvm::support; uint32_t ModuleFileID = @@ -1244,12 +1244,12 @@ ASTDeclContextNameLookupTrait::ReadFileRef(const unsigned char *&d) { } std::pair -ASTDeclContextNameLookupTrait::ReadKeyDataLength(const unsigned char *&d) { +ASTDeclContextNameLookupTraitBase::ReadKeyDataLength(const unsigned char *&d) { return readULEBKeyDataLength(d); } -ASTDeclContextNameLookupTrait::internal_key_type -ASTDeclContextNameLookupTrait::ReadKey(const unsigned char *d, unsigned) { +DeclarationNameKey +ASTDeclContextNameLookupTraitBase::ReadKeyBase(const unsigned char *&d) { using namespace llvm::support; auto Kind = (DeclarationName::NameKind)*d++; @@ -1283,10 +1283,13 @@ ASTDeclContextNameLookupTrait::ReadKey(const unsigned char *d, unsigned) { return DeclarationNameKey(Kind, Data); } -void ASTDeclContextNameLookupTrait::ReadDataInto(internal_key_type, - const unsigned char *d, - unsigned DataLen, - data_type_builder &Val) { +ASTDeclContextNameLookupTrait::internal_key_type +ASTDeclContextNameLookupTrait::ReadKey(const unsigned char *d, unsigned) { + return ReadKeyBase(d); +} + +void ASTDeclContextNameLookupTraitBase::ReadDataIntoImpl( + const unsigned char *d, unsigned DataLen, data_type_builder &Val) { using namespace llvm::support; for (unsigned NumDecls = DataLen / sizeof(DeclID); NumDecls; --NumDecls) { @@ -1296,6 +1299,47 @@ void ASTDeclContextNameLookupTrait::ReadDataInto(internal_key_type, } } +void ASTDeclContextNameLookupTrait::ReadDataInto(internal_key_type, + const unsigned char *d, + unsigned DataLen, + data_type_builder &Val) { + ReadDataIntoImpl(d, DataLen, Val); +} + +ModuleLocalNameLookupTrait::hash_value_type +ModuleLocalNameLookupTrait::ComputeHash(const internal_key_type &Key) { + llvm::FoldingSetNodeID ID; + ID.AddInteger(Key.first.getHash()); + ID.AddInteger(Key.second); + return 
ID.computeStableHash(); +} + +ModuleLocalNameLookupTrait::internal_key_type +ModuleLocalNameLookupTrait::GetInternalKey(const external_key_type &Key) { + DeclarationNameKey Name(Key.first); + + std::optional ModuleHash = getPrimaryModuleHash(Key.second); + if (!ModuleHash) + return {Name, 0}; + + return {Name, *ModuleHash}; +} + +ModuleLocalNameLookupTrait::internal_key_type +ModuleLocalNameLookupTrait::ReadKey(const unsigned char *d, unsigned) { + DeclarationNameKey Name = ReadKeyBase(d); + unsigned PrimaryModuleHash = + llvm::support::endian::readNext(d); + return {Name, PrimaryModuleHash}; +} + +void ModuleLocalNameLookupTrait::ReadDataInto(internal_key_type, + const unsigned char *d, + unsigned DataLen, + data_type_builder &Val) { + ReadDataIntoImpl(d, DataLen, Val); +} + ModuleFile * LazySpecializationInfoLookupTrait::ReadFileRef(const unsigned char *&d) { using namespace llvm::support; @@ -1383,8 +1427,8 @@ bool ASTReader::ReadLexicalDeclContextStorage(ModuleFile &M, bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, BitstreamCursor &Cursor, - uint64_t Offset, - GlobalDeclID ID) { + uint64_t Offset, GlobalDeclID ID, + bool IsModuleLocal) { assert(Offset != 0); SavedStreamPosition SavedPosition(Cursor); @@ -1408,15 +1452,22 @@ bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, return true; } unsigned RecCode = MaybeRecCode.get(); - if (RecCode != DECL_CONTEXT_VISIBLE) { + if (!IsModuleLocal && RecCode != DECL_CONTEXT_VISIBLE) { Error("Expected visible lookup table block"); return true; } + if (IsModuleLocal && RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { + Error("Expected module local visible lookup table block"); + return true; + } // We can't safely determine the primary context yet, so delay attaching the // lookup table until we're done with recursive deserialization. auto *Data = (const unsigned char*)Blob.data(); - PendingVisibleUpdates[ID].push_back(UpdateData{&M, Data}); + if (!IsModuleLocal) + PendingVisibleUpdates[ID].push_back(UpdateData{&M, Data}); + else + PendingModuleLocalVisibleUpdates[ID].push_back(UpdateData{&M, Data}); return false; } @@ -3549,6 +3600,19 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; } + case UPDATE_MODULE_LOCAL_VISIBLE: { + unsigned Idx = 0; + GlobalDeclID ID = ReadDeclID(F, Record, Idx); + auto *Data = (const unsigned char *)Blob.data(); + PendingModuleLocalVisibleUpdates[ID].push_back(UpdateData{&F, Data}); + // If we've already loaded the decl, perform the updates when we finish + // loading this block. + if (Decl *D = GetExistingDecl(ID)) + PendingUpdateRecords.push_back( + PendingUpdateRecord(ID, D, /*JustLoaded=*/false)); + break; + } + case CXX_ADDED_TEMPLATE_SPECIALIZATION: { unsigned Idx = 0; GlobalDeclID ID = ReadDeclID(F, Record, Idx); @@ -3652,6 +3716,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, TotalNumMacros += Record[1]; TotalLexicalDeclContexts += Record[2]; TotalVisibleDeclContexts += Record[3]; + TotalModuleLocalVisibleDeclContexts += Record[4]; break; case UNUSED_FILESCOPED_DECLS: @@ -3937,7 +4002,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; case DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD: { - if (Record.size() % 3 != 0) + if (Record.size() % 4 != 0) return llvm::createStringError( std::errc::illegal_byte_sequence, "invalid DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD block in AST " @@ -3953,8 +4018,12 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, uint64_t LocalVisibleOffset = Record[I++]; uint64_t VisibleOffset = LocalVisibleOffset ? 
BaseOffset + LocalVisibleOffset : 0; + uint64_t LocalModuleLocalOffset = Record[I++]; + uint64_t ModuleLocalOffset = + LocalModuleLocalOffset ? BaseOffset + LocalModuleLocalOffset : 0; - DelayedNamespaceOffsetMap[ID] = {LexicalOffset, VisibleOffset}; + DelayedNamespaceOffsetMap[ID] = {LexicalOffset, VisibleOffset, + ModuleLocalOffset}; assert(!GetExistingDecl(ID) && "We shouldn't load the namespace in the front of delayed " @@ -8366,31 +8435,42 @@ void ASTReader::FindFileRegionDecls(FileID File, *DInfo.Mod, LocalDeclID::get(*this, *DInfo.Mod, *DIt)))); } -bool -ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC, - DeclarationName Name) { +bool ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC, + DeclarationName Name, + Module *NamedModule) { assert(DC->hasExternalVisibleStorage() && DC == DC->getPrimaryContext() && "DeclContext has no visible decls in storage"); if (!Name) return false; - auto It = Lookups.find(DC); - if (It == Lookups.end()) - return false; - - Deserializing LookupResults(this); - // Load the list of declarations. SmallVector Decls; llvm::SmallPtrSet Found; - for (GlobalDeclID ID : It->second.Table.find(Name)) { - NamedDecl *ND = cast(GetDecl(ID)); - if (ND->getDeclName() == Name && Found.insert(ND).second) - Decls.push_back(ND); + Deserializing LookupResults(this); + + // FIXME: Remove the redundancy with a templated lambda once C++20 is + // available. + if (auto It = Lookups.find(DC); It != Lookups.end()) { + ++NumVisibleDeclContextsRead; + for (GlobalDeclID ID : It->second.Table.find(Name)) { + NamedDecl *ND = cast(GetDecl(ID)); + if (ND->getDeclName() == Name && Found.insert(ND).second) + Decls.push_back(ND); + } + } + + if (NamedModule) { + if (auto It = ModuleLocalLookups.find(DC); It != ModuleLocalLookups.end()) { + ++NumModuleLocalVisibleDeclContexts; + for (GlobalDeclID ID : It->second.Table.find({Name, NamedModule})) { + NamedDecl *ND = cast(GetDecl(ID)); + if (ND->getDeclName() == Name && Found.insert(ND).second) + Decls.push_back(ND); + } + } } - ++NumVisibleDeclContextsRead; SetExternalVisibleDeclsForName(DC, Name, Decls); return !Decls.empty(); } @@ -8399,18 +8479,25 @@ void ASTReader::completeVisibleDeclsMap(const DeclContext *DC) { if (!DC->hasExternalVisibleStorage()) return; - auto It = Lookups.find(DC); - assert(It != Lookups.end() && - "have external visible storage but no lookup tables"); - DeclsMap Decls; - for (GlobalDeclID ID : It->second.Table.findAll()) { - NamedDecl *ND = cast(GetDecl(ID)); - Decls[ND->getDeclName()].push_back(ND); - } + auto findAll = [&](auto &LookupTables, unsigned &NumRead) { + auto It = LookupTables.find(DC); + if (It == LookupTables.end()) + return; - ++NumVisibleDeclContextsRead; + NumRead++; + + for (GlobalDeclID ID : It->second.Table.findAll()) { + NamedDecl *ND = cast(GetDecl(ID)); + Decls[ND->getDeclName()].push_back(ND); + } + + // FIXME: Why does a PCH test fail if we remove the iterator after findAll? + }; + + findAll(Lookups, NumVisibleDeclContextsRead); + findAll(ModuleLocalLookups, NumModuleLocalVisibleDeclContexts); for (DeclsMap::iterator I = Decls.begin(), E = Decls.end(); I != E; ++I) { SetExternalVisibleDeclsForName(DC, I->first, I->second); @@ -8424,6 +8511,12 @@ ASTReader::getLoadedLookupTables(DeclContext *Primary) const { return I == Lookups.end() ?
nullptr : &I->second; } +const serialization::reader::ModuleLocalLookupTable * +ASTReader::getModuleLocalLookupTables(DeclContext *Primary) const { + auto I = ModuleLocalLookups.find(Primary); + return I == ModuleLocalLookups.end() ? nullptr : &I->second; +} + serialization::reader::LazySpecializationInfoLookupTable * ASTReader::getLoadedSpecializationsLookupTables(const Decl *D, bool IsPartial) { assert(D->isCanonicalDecl()); @@ -8533,6 +8626,12 @@ void ASTReader::PrintStats() { NumVisibleDeclContextsRead, TotalVisibleDeclContexts, ((float)NumVisibleDeclContextsRead/TotalVisibleDeclContexts * 100)); + if (TotalModuleLocalVisibleDeclContexts) + std::fprintf( + stderr, " %u/%u module local visible declcontexts read (%f%%)\n", + NumModuleLocalVisibleDeclContexts, TotalModuleLocalVisibleDeclContexts, + ((float)NumModuleLocalVisibleDeclContexts / + TotalModuleLocalVisibleDeclContexts * 100)); if (TotalNumMethodPoolEntries) std::fprintf(stderr, " %u/%u method pool entries read (%f%%)\n", NumMethodPoolEntriesRead, TotalNumMethodPoolEntries, @@ -12639,3 +12738,25 @@ void ASTRecordReader::readOpenACCClauseList( for (unsigned I = 0; I < Clauses.size(); ++I) Clauses[I] = readOpenACCClause(); } + +static unsigned getStableHashForModuleName(StringRef PrimaryModuleName) { + // TODO: Maybe it is better to check that PrimaryModuleName is a valid + // module name? + llvm::FoldingSetNodeID ID; + ID.AddString(PrimaryModuleName); + return ID.computeStableHash(); +} + +std::optional clang::getPrimaryModuleHash(const Module *M) { + if (!M) + return std::nullopt; + + if (M->isHeaderLikeModule()) + return std::nullopt; + + if (M->isGlobalModule()) + return std::nullopt; + + StringRef PrimaryModuleName = M->getPrimaryModuleInterfaceName(); + return getStableHashForModuleName(PrimaryModuleName); +} diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 95abd75920c8f..1c51a7b5e460f 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -413,7 +413,8 @@ class ASTDeclReader : public DeclVisitor { void VisitEmptyDecl(EmptyDecl *D); void VisitLifetimeExtendedTemporaryDecl(LifetimeExtendedTemporaryDecl *D); - std::pair VisitDeclContext(DeclContext *DC); + void VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, + uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset); template RedeclarableResult VisitRedeclarable(Redeclarable *D); @@ -1855,7 +1856,10 @@ void ASTDeclReader::VisitNamespaceDecl(NamespaceDecl *D) { void ASTDeclReader::VisitHLSLBufferDecl(HLSLBufferDecl *D) { VisitNamedDecl(D); - VisitDeclContext(D); + uint64_t LexicalOffset = 0; + uint64_t VisibleOffset = 0; + uint64_t ModuleLocalOffset = 0; + VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset); D->IsCBuffer = Record.readBool(); D->KwLoc = readSourceLocation(); D->LBraceLoc = readSourceLocation(); @@ -2764,11 +2768,12 @@ void ASTDeclReader::VisitLifetimeExtendedTemporaryDecl( mergeMergeable(D); } -std::pair -ASTDeclReader::VisitDeclContext(DeclContext *DC) { - uint64_t LexicalOffset = ReadLocalOffset(); - uint64_t VisibleOffset = ReadLocalOffset(); - return std::make_pair(LexicalOffset, VisibleOffset); +void ASTDeclReader::VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, + uint64_t &VisibleOffset, + uint64_t &ModuleLocalOffset) { + LexicalOffset = ReadLocalOffset(); + VisibleOffset = ReadLocalOffset(); + ModuleLocalOffset = ReadLocalOffset(); } template @@ -3869,6 +3874,7 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) {
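// (Sketch of the dispatch below: lookup-table blobs are not decl records, so
// DECL_CONTEXT_VISIBLE and the new DECL_CONTEXT_MODULE_LOCAL_VISIBLE are
// rejected here and only ever decoded on demand through
// ReadVisibleDeclContextStorage.)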
switch ((DeclCode)MaybeDeclCode.get()) { case DECL_CONTEXT_LEXICAL: case DECL_CONTEXT_VISIBLE: + case DECL_CONTEXT_MODULE_LOCAL_VISIBLE: case DECL_SPECIALIZATIONS: case DECL_PARTIAL_SPECIALIZATIONS: llvm_unreachable("Record cannot be de-serialized with readDeclRecord"); @@ -4176,21 +4182,35 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { // If this declaration is also a declaration context, get the // offsets for its tables of lexical and visible declarations. if (auto *DC = dyn_cast(D)) { - std::pair Offsets = Reader.VisitDeclContext(DC); + uint64_t LexicalOffset = 0; + uint64_t VisibleOffset = 0; + uint64_t ModuleLocalOffset = 0; + + Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, + ModuleLocalOffset); // Get the lexical and visible block for the delayed namespace. // It is sufficient to judge if ID is in DelayedNamespaceOffsetMap. // But it may be more efficient to filter the other cases. - if (!Offsets.first && !Offsets.second && isa(D)) + if (!LexicalOffset && !VisibleOffset && !ModuleLocalOffset && + isa(D)) if (auto Iter = DelayedNamespaceOffsetMap.find(ID); - Iter != DelayedNamespaceOffsetMap.end()) - Offsets = Iter->second; + Iter != DelayedNamespaceOffsetMap.end()) { + LexicalOffset = Iter->second.LexicalOffset; + VisibleOffset = Iter->second.VisibleOffset; + ModuleLocalOffset = Iter->second.ModuleLocalOffset; + } - if (Offsets.first && - ReadLexicalDeclContextStorage(*Loc.F, DeclsCursor, Offsets.first, DC)) + if (LexicalOffset && + ReadLexicalDeclContextStorage(*Loc.F, DeclsCursor, LexicalOffset, DC)) + return nullptr; + if (VisibleOffset && + ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, VisibleOffset, ID, + /*IsModuleLocal=*/false)) return nullptr; - if (Offsets.second && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, Offsets.second, ID)) + if (ModuleLocalOffset && + ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, ModuleLocalOffset, + ID, /*IsModuleLocal=*/true)) return nullptr; } assert(Record.getIdx() == Record.size()); @@ -4328,8 +4348,8 @@ void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) { } // Load the pending visible updates for this decl context, if it has any. - auto I = PendingVisibleUpdates.find(ID); - if (I != PendingVisibleUpdates.end()) { + if (auto I = PendingVisibleUpdates.find(ID); + I != PendingVisibleUpdates.end()) { auto VisibleUpdates = std::move(I->second); PendingVisibleUpdates.erase(I); @@ -4341,6 +4361,21 @@ void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) { DC->setHasExternalVisibleStorage(true); } + if (auto I = PendingModuleLocalVisibleUpdates.find(ID); + I != PendingModuleLocalVisibleUpdates.end()) { + auto ModuleLocalVisibleUpdates = std::move(I->second); + PendingModuleLocalVisibleUpdates.erase(I); + + auto *DC = cast(D)->getPrimaryContext(); + for (const auto &Update : ModuleLocalVisibleUpdates) + ModuleLocalLookups[DC].Table.add( + Update.Mod, Update.Data, + reader::ModuleLocalNameLookupTrait(*this, *Update.Mod)); + // NOTE: Can we optimize the case where the data being loaded + // is not related to the current module? + DC->setHasExternalVisibleStorage(true); + } + + // Load any pending related decls.
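// (Related decls are tracked per canonical decl, so the lookup below keys
// RelatedDeclsMap by the canonical declaration's ID.)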
if (D->isCanonicalDecl()) { if (auto IT = RelatedDeclsMap.find(ID); IT != RelatedDeclsMap.end()) { diff --git a/clang/lib/Serialization/ASTReaderInternals.h b/clang/lib/Serialization/ASTReaderInternals.h index be0d22d1f4094..4be2b2323ec40 100644 --- a/clang/lib/Serialization/ASTReaderInternals.h +++ b/clang/lib/Serialization/ASTReaderInternals.h @@ -31,6 +31,7 @@ class FileEntry; struct HeaderFileInfo; class HeaderSearch; class ObjCMethodDecl; +class Module; namespace serialization { @@ -38,9 +39,8 @@ class ModuleFile; namespace reader { -/// Class that performs name lookup into a DeclContext stored -/// in an AST file. -class ASTDeclContextNameLookupTrait { +class ASTDeclContextNameLookupTraitBase { +protected: ASTReader &Reader; ModuleFile &F; @@ -80,11 +80,37 @@ class ASTDeclContextNameLookupTrait { using offset_type = unsigned; using file_type = ModuleFile *; - using external_key_type = DeclarationName; - using internal_key_type = DeclarationNameKey; +protected: + explicit ASTDeclContextNameLookupTraitBase(ASTReader &Reader, ModuleFile &F) + : Reader(Reader), F(F) {} + +public: + static std::pair + ReadKeyDataLength(const unsigned char *&d); + + void ReadDataIntoImpl(const unsigned char *d, unsigned DataLen, + data_type_builder &Val); + + static void MergeDataInto(const data_type &From, data_type_builder &To) { + To.Data.reserve(To.Data.size() + From.size()); + for (GlobalDeclID ID : From) + To.insert(ID); + } + + file_type ReadFileRef(const unsigned char *&d); + + DeclarationNameKey ReadKeyBase(const unsigned char *&d); +}; +/// Class that performs name lookup into a DeclContext stored +/// in an AST file. +class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { +public: explicit ASTDeclContextNameLookupTrait(ASTReader &Reader, ModuleFile &F) - : Reader(Reader), F(F) {} + : ASTDeclContextNameLookupTraitBase(Reader, F) {} + + using external_key_type = DeclarationName; + using internal_key_type = DeclarationNameKey; static bool EqualKey(const internal_key_type &a, const internal_key_type &b) { return a == b; @@ -98,25 +124,39 @@ class ASTDeclContextNameLookupTrait { return Name; } - static std::pair - ReadKeyDataLength(const unsigned char *&d); - internal_key_type ReadKey(const unsigned char *d, unsigned); void ReadDataInto(internal_key_type, const unsigned char *d, unsigned DataLen, data_type_builder &Val); +}; - static void MergeDataInto(const data_type &From, data_type_builder &To) { - To.Data.reserve(To.Data.size() + From.size()); - for (GlobalDeclID ID : From) - To.insert(ID); +struct DeclContextLookupTable { + MultiOnDiskHashTable Table; +}; + +class ModuleLocalNameLookupTrait : public ASTDeclContextNameLookupTraitBase { +public: + explicit ModuleLocalNameLookupTrait(ASTReader &Reader, ModuleFile &F) + : ASTDeclContextNameLookupTraitBase(Reader, F) {} + + using external_key_type = std::pair; + using internal_key_type = std::pair; + + static bool EqualKey(const internal_key_type &a, const internal_key_type &b) { + return a == b; } - file_type ReadFileRef(const unsigned char *&d); + static hash_value_type ComputeHash(const internal_key_type &Key); + static internal_key_type GetInternalKey(const external_key_type &Key); + + internal_key_type ReadKey(const unsigned char *d, unsigned); + + void ReadDataInto(internal_key_type, const unsigned char *d, unsigned DataLen, + data_type_builder &Val); }; -struct DeclContextLookupTable { - MultiOnDiskHashTable Table; +struct ModuleLocalLookupTable { + MultiOnDiskHashTable Table; }; using LazySpecializationInfo = 
GlobalDeclID; diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 345d496a93312..a6f8c6009f1ff 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -1088,6 +1088,7 @@ void ASTWriter::WriteBlockInfoBlock() { RECORD(DECL_BLOCK); RECORD(DECL_CONTEXT_LEXICAL); RECORD(DECL_CONTEXT_VISIBLE); + RECORD(DECL_CONTEXT_MODULE_LOCAL_VISIBLE); RECORD(DECL_NAMESPACE); RECORD(DECL_NAMESPACE_ALIAS); RECORD(DECL_USING); @@ -4024,15 +4025,13 @@ void ASTWriter::handleVTable(CXXRecordDecl *RD) { namespace { -// Trait used for the on-disk hash table used in the method pool. -class ASTDeclContextNameLookupTrait { +class ASTDeclContextNameLookupTraitBase { +protected: ASTWriter &Writer; - llvm::SmallVector DeclIDs; + using DeclIDsTy = llvm::SmallVector; + DeclIDsTy DeclIDs; public: - using key_type = DeclarationNameKey; - using key_type_ref = key_type; - /// A start and end index into DeclIDs, representing a sequence of decls. using data_type = std::pair; using data_type_ref = const data_type &; @@ -4040,31 +4039,11 @@ class ASTDeclContextNameLookupTrait { using hash_value_type = unsigned; using offset_type = unsigned; - explicit ASTDeclContextNameLookupTrait(ASTWriter &Writer) : Writer(Writer) {} - - template - data_type getData(const Coll &Decls) { - unsigned Start = DeclIDs.size(); - for (NamedDecl *D : Decls) { - NamedDecl *DeclForLocalLookup = - getDeclForLocalLookup(Writer.getLangOpts(), D); - - if (Writer.getDoneWritingDeclsAndTypes() && - !Writer.wasDeclEmitted(DeclForLocalLookup)) - continue; - - // Try to avoid writing internal decls to reduced BMI. - // See comments in ASTWriter::WriteDeclContextLexicalBlock for details. - if (Writer.isGeneratingReducedBMI() && - !DeclForLocalLookup->isFromExplicitGlobalModule() && - IsInternalDeclFromFileContext(DeclForLocalLookup)) - continue; - - DeclIDs.push_back(Writer.GetDeclRef(DeclForLocalLookup)); - } - return std::make_pair(Start, DeclIDs.size()); - } +protected: + explicit ASTDeclContextNameLookupTraitBase(ASTWriter &Writer) + : Writer(Writer) {} +public: data_type ImportData(const reader::ASTDeclContextNameLookupTrait::data_type &FromReader) { unsigned Start = DeclIDs.size(); DeclIDs.insert( @@ -4074,14 +4053,6 @@ class ASTDeclContextNameLookupTrait { return std::make_pair(Start, DeclIDs.size()); } - static bool EqualKey(key_type_ref a, key_type_ref b) { - return a == b; - } - - hash_value_type ComputeHash(DeclarationNameKey Name) { - return Name.getHash(); - } - void EmitFileRef(raw_ostream &Out, ModuleFile *F) const { assert(Writer.hasChain() && "have reference to loaded module file but no chain?"); @@ -4092,9 +4063,9 @@ class ASTDeclContextNameLookupTrait { llvm::endianness::little); } - std::pair EmitKeyDataLength(raw_ostream &Out, - DeclarationNameKey Name, - data_type_ref Lookup) { + std::pair EmitKeyDataLengthBase(raw_ostream &Out, + DeclarationNameKey Name, + data_type_ref Lookup) { unsigned KeyLen = 1; switch (Name.getKind()) { case DeclarationName::Identifier: @@ -4120,10 +4091,10 @@ class ASTDeclContextNameLookupTrait { // length of DeclIDs. 
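// (EmitKeyDataLengthBase now returns the raw {KeyLen, DataLen} pair instead
// of writing it out; each derived trait emits the pair itself, growing
// KeyLen where needed, as the module-local trait does later in this file:
//   auto [KeyLen, DataLen] = EmitKeyDataLengthBase(Out, Key.first, Lookup);
//   KeyLen += sizeof(Key.second);  // room for the primary module hash
//   return emitULEBKeyDataLength(KeyLen, DataLen, Out);
// )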
unsigned DataLen = sizeof(DeclID) * (Lookup.second - Lookup.first); - return emitULEBKeyDataLength(KeyLen, DataLen, Out); + return {KeyLen, DataLen}; } - void EmitKey(raw_ostream &Out, DeclarationNameKey Name, unsigned) { + void EmitKeyBase(raw_ostream &Out, DeclarationNameKey Name) { using namespace llvm::support; endian::Writer LE(Out, llvm::endianness::little); @@ -4154,8 +4125,7 @@ class ASTDeclContextNameLookupTrait { llvm_unreachable("Invalid name kind?"); } - void EmitData(raw_ostream &Out, key_type_ref, data_type Lookup, - unsigned DataLen) { + void EmitDataBase(raw_ostream &Out, data_type Lookup, unsigned DataLen) { using namespace llvm::support; endian::Writer LE(Out, llvm::endianness::little); @@ -4166,6 +4136,129 @@ class ASTDeclContextNameLookupTrait { } }; +class ModuleLocalNameLookupTrait : public ASTDeclContextNameLookupTraitBase { +public: + using primary_module_hash_type = unsigned; + + using key_type = std::pair; + using key_type_ref = key_type; + + explicit ModuleLocalNameLookupTrait(ASTWriter &Writer) + : ASTDeclContextNameLookupTraitBase(Writer) {} + + data_type getData(const DeclIDsTy &LocalIDs) { + unsigned Start = DeclIDs.size(); + for (auto ID : LocalIDs) + DeclIDs.push_back(ID); + return std::make_pair(Start, DeclIDs.size()); + } + + static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } + + hash_value_type ComputeHash(key_type Key) { + llvm::FoldingSetNodeID ID; + ID.AddInteger(Key.first.getHash()); + ID.AddInteger(Key.second); + return ID.computeStableHash(); + } + + std::pair + EmitKeyDataLength(raw_ostream &Out, key_type Key, data_type_ref Lookup) { + auto [KeyLen, DataLen] = EmitKeyDataLengthBase(Out, Key.first, Lookup); + KeyLen += sizeof(Key.second); + return emitULEBKeyDataLength(KeyLen, DataLen, Out); + } + + void EmitKey(raw_ostream &Out, key_type Key, unsigned) { + EmitKeyBase(Out, Key.first); + llvm::support::endian::Writer LE(Out, llvm::endianness::little); + LE.write(Key.second); + } + + void EmitData(raw_ostream &Out, key_type_ref, data_type Lookup, + unsigned DataLen) { + EmitDataBase(Out, Lookup, DataLen); + } +}; + +// Trait used for the on-disk hash table used in the method pool. +class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { +public: + using ModuleLocalDeclsMapTy = + llvm::DenseMap; + +private: + ModuleLocalDeclsMapTy ModuleLocalDeclsMap; + +public: + using key_type = DeclarationNameKey; + using key_type_ref = key_type; + + explicit ASTDeclContextNameLookupTrait(ASTWriter &Writer) + : ASTDeclContextNameLookupTraitBase(Writer) {} + + template data_type getData(const Coll &Decls) { + unsigned Start = DeclIDs.size(); + for (NamedDecl *D : Decls) { + NamedDecl *DeclForLocalLookup = + getDeclForLocalLookup(Writer.getLangOpts(), D); + + if (Writer.getDoneWritingDeclsAndTypes() && + !Writer.wasDeclEmitted(DeclForLocalLookup)) + continue; + + // Try to avoid writing internal decls to reduced BMI. + // See comments in ASTWriter::WriteDeclContextLexicalBlock for details. 
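// (Module-linkage names take a different path below: instead of landing in
// the general table, they are diverted into ModuleLocalDeclsMap, keyed by
// the pair {DeclarationName, hash of the owning primary module}.)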
+ if (Writer.isGeneratingReducedBMI() && + !DeclForLocalLookup->isFromExplicitGlobalModule() && + IsInternalDeclFromFileContext(DeclForLocalLookup)) + continue; + + auto ID = Writer.GetDeclRef(DeclForLocalLookup); + + if (D->getFormalLinkage() == Linkage::Module) { + if (std::optional PrimaryModuleHash = + getPrimaryModuleHash(D->getOwningModule())) { + auto Key = std::make_pair(D->getDeclName(), *PrimaryModuleHash); + auto Iter = ModuleLocalDeclsMap.find(Key); + if (Iter == ModuleLocalDeclsMap.end()) + ModuleLocalDeclsMap.insert({Key, DeclIDsTy{ID}}); + else + Iter->second.push_back(ID); + continue; + } + } + + DeclIDs.push_back(ID); + } + return std::make_pair(Start, DeclIDs.size()); + } + + const ModuleLocalDeclsMapTy &getModuleLocalDecls() { + return ModuleLocalDeclsMap; + } + + static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } + + hash_value_type ComputeHash(key_type Name) { return Name.getHash(); } + + std::pair EmitKeyDataLength(raw_ostream &Out, + DeclarationNameKey Name, + data_type_ref Lookup) { + auto [KeyLen, DataLen] = EmitKeyDataLengthBase(Out, Name, Lookup); + return emitULEBKeyDataLength(KeyLen, DataLen, Out); + } + + void EmitKey(raw_ostream &Out, DeclarationNameKey Name, unsigned) { + return EmitKeyBase(Out, Name); + } + + void EmitData(raw_ostream &Out, key_type_ref, data_type Lookup, + unsigned DataLen) { + EmitDataBase(Out, Lookup, DataLen); + } +}; + } // namespace namespace { @@ -4371,7 +4464,8 @@ static bool isLookupResultNotInteresting(ASTWriter &Writer, void ASTWriter::GenerateNameLookupTable( ASTContext &Context, const DeclContext *ConstDC, - llvm::SmallVectorImpl &LookupTable) { + llvm::SmallVectorImpl &LookupTable, + llvm::SmallVectorImpl &ModuleLocalLookupTable) { assert(!ConstDC->hasLazyLocalLexicalLookups() && !ConstDC->hasLazyExternalLexicalLookups() && "must call buildLookups first"); @@ -4553,6 +4647,28 @@ void ASTWriter::GenerateNameLookupTable( // merged table if there is one. auto *Lookups = Chain ? Chain->getLoadedLookupTables(DC) : nullptr; Generator.emit(LookupTable, Trait, Lookups ? &Lookups->Table : nullptr); + + const auto &ModuleLocalDecls = Trait.getModuleLocalDecls(); + if (ModuleLocalDecls.empty()) + return; + + MultiOnDiskHashTableGenerator + ModuleLocalLookupGenerator; + ModuleLocalNameLookupTrait ModuleLocalTrait(*this); + + for (const auto &ModuleLocalIter : ModuleLocalDecls) { + const auto &Key = ModuleLocalIter.first; + const auto &IDs = ModuleLocalIter.second; + ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs), + ModuleLocalTrait); + } + + auto *ModuleLocalLookups = + Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr; + ModuleLocalLookupGenerator.emit( + ModuleLocalLookupTable, ModuleLocalTrait, + ModuleLocalLookups ? &ModuleLocalLookups->Table : nullptr); } /// Write the block containing all of the declaration IDs @@ -4560,8 +4676,10 @@ void ASTWriter::GenerateNameLookupTable( /// /// \returns the offset of the DECL_CONTEXT_VISIBLE block within the /// bitstream, or 0 if no block was written. -uint64_t ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, - DeclContext *DC) { +void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, + DeclContext *DC, + uint64_t &VisibleBlockOffset, + uint64_t &ModuleLocalBlockOffset) { // If we imported a key declaration of this namespace, write the visible // lookup results as an update record for it rather than including them // on this declaration. We will only look at key declarations on reload. 
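// (The function now reports through the VisibleBlockOffset and
// ModuleLocalBlockOffset out-parameters rather than returning a single
// offset, since one context can now emit two lookup tables.)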
@@ -4571,7 +4689,7 @@ uint64_t ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, for (auto *Prev = cast(DC)->getPreviousDecl(); Prev; Prev = Prev->getPreviousDecl()) if (!Prev->isFromASTFile()) - return 0; + return; // Note that we need to emit an update record for the primary context. UpdatedDeclContexts.insert(DC->getPrimaryContext()); @@ -4620,41 +4738,53 @@ uint64_t ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, } } - return 0; + return; } if (DC->getPrimaryContext() != DC) - return 0; + return; // Skip contexts which don't support name lookup. if (!DC->isLookupContext()) - return 0; + return; // If not in C++, we perform name lookup for the translation unit via the // IdentifierInfo chains, don't bother to build a visible-declarations table. if (DC->isTranslationUnit() && !Context.getLangOpts().CPlusPlus) - return 0; + return; // Serialize the contents of the mapping used for lookup. Note that, // although we have two very different code paths, the serialized // representation is the same for both cases: a declaration name, // followed by a size, followed by references to the visible // declarations that have that name. - uint64_t Offset = Stream.GetCurrentBitNo(); StoredDeclsMap *Map = DC->buildLookup(); if (!Map || Map->empty()) - return 0; + return; + VisibleBlockOffset = Stream.GetCurrentBitNo(); // Create the on-disk hash table in a buffer. SmallString<4096> LookupTable; - GenerateNameLookupTable(Context, DC, LookupTable); + SmallString<4096> ModuleLocalLookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); // Write the lookup table RecordData::value_type Record[] = {DECL_CONTEXT_VISIBLE}; Stream.EmitRecordWithBlob(DeclContextVisibleLookupAbbrev, Record, LookupTable); ++NumVisibleDeclContexts; - return Offset; + + if (ModuleLocalLookupTable.empty()) + return; + + ModuleLocalBlockOffset = Stream.GetCurrentBitNo(); + assert(ModuleLocalBlockOffset > VisibleBlockOffset); + // Write the lookup table + RecordData::value_type ModuleLocalRecord[] = { + DECL_CONTEXT_MODULE_LOCAL_VISIBLE}; + Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev, + ModuleLocalRecord, ModuleLocalLookupTable); + ++NumModuleLocalDeclContexts; } /// Write an UPDATE_VISIBLE block for the given context. @@ -4671,7 +4801,8 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, // Create the on-disk hash table in a buffer. SmallString<4096> LookupTable; - GenerateNameLookupTable(Context, DC, LookupTable); + SmallString<4096> ModuleLocalLookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); // If we're updating a namespace, select a key declaration as the key for the // update record; those are the only ones that will be checked on reload. @@ -4682,6 +4813,15 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, RecordData::value_type Record[] = {UPDATE_VISIBLE, getDeclID(cast(DC)).getRawValue()}; Stream.EmitRecordWithBlob(UpdateVisibleAbbrev, Record, LookupTable); + + if (ModuleLocalLookupTable.empty()) + return; + + // Write the module local lookup table + RecordData::value_type ModuleLocalRecord[] = { + UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; + Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord, + ModuleLocalLookupTable); } /// Write an FP_PRAGMA_OPTIONS block for the given FPOptions. 
@@ -5865,7 +6005,8 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema *SemaPtr, StringRef isysroot, // Some simple statistics RecordData::value_type Record[] = { - NumStatements, NumMacros, NumLexicalDeclContexts, NumVisibleDeclContexts}; + NumStatements, NumMacros, NumLexicalDeclContexts, NumVisibleDeclContexts, + NumModuleLocalDeclContexts}; Stream.EmitRecord(STATISTICS, Record); Stream.ExitBlock(); Stream.FlushToWord(); @@ -5942,7 +6083,9 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { RecordData DelayedNamespaceRecord; for (NamespaceDecl *NS : DelayedNamespace) { uint64_t LexicalOffset = WriteDeclContextLexicalBlock(Context, NS); - uint64_t VisibleOffset = WriteDeclContextVisibleBlock(Context, NS); + uint64_t VisibleOffset = 0; + uint64_t ModuleLocalOffset = 0; + WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset); // Write the offset relative to current block. if (LexicalOffset) @@ -5951,9 +6094,13 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { if (VisibleOffset) VisibleOffset -= DeclTypesBlockStartOffset; + if (ModuleLocalOffset) + ModuleLocalOffset -= DeclTypesBlockStartOffset; + AddDeclRef(NS, DelayedNamespaceRecord); DelayedNamespaceRecord.push_back(LexicalOffset); DelayedNamespaceRecord.push_back(VisibleOffset); + DelayedNamespaceRecord.push_back(ModuleLocalOffset); } // The process of writing lexical and visible block for delayed namespace @@ -6033,6 +6180,12 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); UpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(llvm::BitCodeAbbrevOp(UPDATE_MODULE_LOCAL_VISIBLE)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); + ModuleLocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + // And a visible updates block for the translation unit. 
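// (UPDATE_MODULE_LOCAL_VISIBLE mirrors the UPDATE_VISIBLE layout: a VBR6
// decl ID followed by a blob holding the serialized on-disk hash table.)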
WriteDeclContextVisibleUpdate(Context, TU); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 3b357f3c50dad..7a494cfe1ac64 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2068,6 +2068,7 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; + uint64_t ModuleLocalOffset = 0; if (Writer.isGeneratingReducedBMI() && isa(DC) && cast(DC)->isFromExplicitGlobalModule()) { @@ -2078,12 +2079,13 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { } else { LexicalOffset = Writer.WriteDeclContextLexicalBlock(Record.getASTContext(), DC); - VisibleOffset = - Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC); + Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC, + VisibleOffset, ModuleLocalOffset); } Record.AddOffset(LexicalOffset); Record.AddOffset(VisibleOffset); + Record.AddOffset(ModuleLocalOffset); } const Decl *ASTWriter::getFirstLocalDecl(const Decl *D) { @@ -2438,6 +2440,7 @@ void ASTWriter::WriteDeclAbbrevs() { // DC Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset DeclEnumAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_RECORD @@ -2490,6 +2493,7 @@ void ASTWriter::WriteDeclAbbrevs() { // DC Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset DeclRecordAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_PARM_VAR @@ -2827,6 +2831,11 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); DeclContextVisibleLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(BitCodeAbbrevOp(serialization::DECL_CONTEXT_MODULE_LOCAL_VISIBLE)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); + DeclModuleLocalVisibleLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); Abv->Add(BitCodeAbbrevOp(serialization::DECL_SPECIALIZATIONS)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); diff --git a/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp b/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp index c4af02f21f494..55bcb6e220e1e 100644 --- a/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp @@ -226,7 +226,7 @@ void ExplodedNode::NodeGroup::addNode(ExplodedNode *N, ExplodedGraph &G) { return; } - ExplodedNodeVector *V = Storage.dyn_cast(); + ExplodedNodeVector *V = dyn_cast(Storage); if (!V) { // Switch from single-node to multi-node representation. 
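// (Same mechanical dyn_cast respelling as in SemaCodeComplete.cpp above;
// NodeGroup's storage is assumed to be a PointerUnion that starts as a
// single ExplodedNode pointer and is promoted to an ExplodedNodeVector once
// a second node is added.)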
@@ -251,7 +251,7 @@ unsigned ExplodedNode::NodeGroup::size() const { const GroupStorage &Storage = reinterpret_cast(P); if (Storage.isNull()) return 0; - if (ExplodedNodeVector *V = Storage.dyn_cast()) + if (ExplodedNodeVector *V = dyn_cast(Storage)) return V->size(); return 1; } @@ -263,7 +263,7 @@ ExplodedNode * const *ExplodedNode::NodeGroup::begin() const { const GroupStorage &Storage = reinterpret_cast(P); if (Storage.isNull()) return nullptr; - if (ExplodedNodeVector *V = Storage.dyn_cast()) + if (ExplodedNodeVector *V = dyn_cast(Storage)) return V->begin(); return Storage.getAddrOfPtr1(); } @@ -275,7 +275,7 @@ ExplodedNode * const *ExplodedNode::NodeGroup::end() const { const GroupStorage &Storage = reinterpret_cast(P); if (Storage.isNull()) return nullptr; - if (ExplodedNodeVector *V = Storage.dyn_cast()) + if (ExplodedNodeVector *V = dyn_cast(Storage)) return V->end(); return Storage.getAddrOfPtr1() + 1; } diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index 88b7349ce8fed..03523c3f17eda 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -692,11 +692,12 @@ std::unique_ptr buildASTFromCodeWithArgs( StringRef Code, const std::vector &Args, StringRef FileName, StringRef ToolName, std::shared_ptr PCHContainerOps, ArgumentsAdjuster Adjuster, const FileContentMappings &VirtualMappedFiles, - DiagnosticConsumer *DiagConsumer) { + DiagnosticConsumer *DiagConsumer, + IntrusiveRefCntPtr BaseFS) { std::vector> ASTs; ASTBuilderAction Action(ASTs); llvm::IntrusiveRefCntPtr OverlayFileSystem( - new llvm::vfs::OverlayFileSystem(llvm::vfs::getRealFileSystem())); + new llvm::vfs::OverlayFileSystem(std::move(BaseFS))); llvm::IntrusiveRefCntPtr InMemoryFileSystem( new llvm::vfs::InMemoryFileSystem); OverlayFileSystem->pushOverlay(InMemoryFileSystem); diff --git a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp index d69db40062dae..54ec6aa61ec37 100644 --- a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp +++ b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp @@ -62,8 +62,8 @@ void test_late() { not_exported = 1; #ifndef IMPLEMENTATION - // expected-error@-2 {{declaration of 'not_exported' must be imported from module 'A' before it is required}} - // expected-note@p2.cpp:19 {{declaration here is not visible}} + // expected-error@-2 {{use of undeclared identifier 'not_exported'; did you mean 'exported'?}} + // expected-note@p2.cpp:18 {{'exported' declared here}} #endif internal = 1; diff --git a/clang/test/CXX/module/basic/basic.link/p2.cppm b/clang/test/CXX/module/basic/basic.link/p2.cppm index 19761fb3359ce..5a497304201dc 100644 --- a/clang/test/CXX/module/basic/basic.link/p2.cppm +++ b/clang/test/CXX/module/basic/basic.link/p2.cppm @@ -62,12 +62,11 @@ import M; void use_from_module_impl() { external_linkage_fn(); - module_linkage_fn(); // expected-error {{declaration of 'module_linkage_fn' must be imported}} + module_linkage_fn(); // expected-error {{use of undeclared identifier 'module_linkage_fn'}} internal_linkage_fn(); // expected-error {{declaration of 'internal_linkage_fn' must be imported}} (void)external_linkage_class{}; (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} - // expected-note@M.cppm:9 {{declaration here is not visible}} // expected-note@M.cppm:10 {{declaration here is not 
visible}} (void)external_linkage_var; (void)module_linkage_var; // expected-error {{undeclared identifier}} diff --git a/clang/test/CXX/module/module.import/p2.cpp b/clang/test/CXX/module/module.import/p2.cpp index 6b8e32f746b62..0ad3bc815beac 100644 --- a/clang/test/CXX/module/module.import/p2.cpp +++ b/clang/test/CXX/module/module.import/p2.cpp @@ -23,10 +23,7 @@ export A f(); //--- Use.cpp import M; void test() { - A a; // expected-error {{definition of 'A' must be imported from module 'M' before it is required}} - // expected-error@-1 {{definition of 'A' must be imported from module 'M' before it is required}} expected-error@-1 {{}} - // expected-note@impl.cppm:2 {{declaration here is not visible}} - // expected-note@impl.cppm:2 {{definition here is not reachable}} expected-note@impl.cppm:2 {{}} + A a; // expected-error {{unknown type name 'A'}} } //--- UseInPartA.cppm @@ -40,10 +37,7 @@ void test() { export module B; import M; void test() { - A a; // expected-error {{declaration of 'A' must be imported from module 'M'}} - // expected-error@-1 {{definition of 'A' must be imported from module 'M'}} expected-error@-1 {{}} - // expected-note@impl.cppm:2 {{declaration here is not visible}} - // expected-note@impl.cppm:2 {{definition here is not reachable}} expected-note@impl.cppm:2 {{}} + A a; // expected-error {{unknown type name 'A'}} } //--- Private.cppm diff --git a/clang/test/CXX/module/module.interface/p7.cpp b/clang/test/CXX/module/module.interface/p7.cpp index 1572390f0d289..cff5df91e43d4 100644 --- a/clang/test/CXX/module/module.interface/p7.cpp +++ b/clang/test/CXX/module/module.interface/p7.cpp @@ -57,12 +57,10 @@ void test() { void test2() { auto a = E1::e1; // OK, namespace-scope name E1 is visible and e1 is reachable auto b = e1; // OK, namespace-scope name e1 is visible - auto c = E2::e2; // expected-error {{declaration of 'E2' must be imported from module}} - // expected-note@* {{declaration here is not visible}} - auto d = e2; // should be error, namespace-scope name e2 is not visible + auto c = E2::e2; // expected-error {{use of undeclared identifier 'E2'}} + auto d = e2; // expected-error {{use of undeclared identifier 'e2'}} auto e = E2U::e2; // OK, namespace-scope name E2U is visible and E2::e2 is reachable - auto f = E3::e3; // expected-error {{declaration of 'E3' must be imported from module 'p7' before it is required}} - // expected-note@* {{declaration here is not visible}} - auto g = e3; // should be error, namespace-scope name e3 is not visible + auto f = E3::e3; // expected-error {{use of undeclared identifier 'E3'}} + auto g = e3; // expected-error {{use of undeclared identifier 'e3'}} auto h = decltype(func())::e3; // OK, namespace-scope name f is visible and E3::e3 is reachable } diff --git a/clang/test/CXX/module/module.reach/p5.cpp b/clang/test/CXX/module/module.reach/p5.cpp index 9c498a260530f..947fd082553ec 100644 --- a/clang/test/CXX/module/module.reach/p5.cpp +++ b/clang/test/CXX/module/module.reach/p5.cpp @@ -14,5 +14,4 @@ export using Y = X; export module B; import A; Y y; // OK, definition of X is reachable -X x; // expected-error {{declaration of 'X' must be imported from module 'A' before it is required}} - // expected-note@* {{declaration here is not visible}} +X x; // expected-error {{unknown type name 'X'}} diff --git a/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml b/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml new file mode 100644 index 0000000000000..fe6a9a8d7f1ee --- /dev/null +++ 
b/clang/test/Driver/baremetal-multilib-custom-flags-parsing.yaml @@ -0,0 +1,133 @@ +# RUN: split-file %s %t + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-without-macro-defines.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/multilib-with-macro-defines.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s +# CHECK-NOT: error: + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-name.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-NAME +# CHECK-MISSING-FLAG-NAME: error: custom flag requires a name + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-values.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUES +# CHECK-MISSING-FLAG-VALUES: error: custom flag must have at least one value + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-default.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-DEFAULT +# CHECK-MISSING-FLAG-VALUE-DEFAULT: error: custom flag must have a default value + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/missing-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-MISSING-FLAG-VALUE-NAME +# CHECK-MISSING-FLAG-VALUE-NAME: error: custom flag value requires a name + +# RUN: %clang --target=arm-none-eabi --multi-lib-config=%t/duplicate-flag-value-name.yaml %s -### -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-DUPLICATE-FLAG-VALUE-NAME +# CHECK-DUPLICATE-FLAG-VALUE-NAME: error: duplicate custom flag value name: "value-name" +# CHECK-DUPLICATE-FLAG-VALUE-NAME-NEXT: - Name: value-name + +#--- multilib-without-macro-defines.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + - Name: b + Default: a + +#--- multilib-with-macro-defines.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + MacroDefines: [FEATURE_A] + - Name: b + MacroDefines: [FEATURE_B] + Default: a + +#--- missing-flag-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Values: + - Name: a + Default: a + +#--- missing-flag-values.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + Default: a + +#--- missing-flag-value-default.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: a + Default: + +#--- missing-flag-value-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=a] + +Flags: + - Name: flag + Values: + - Name: + Default: a + +#--- duplicate-flag-value-name.yaml +--- +MultilibVersion: 1.0 + +Variants: +- Dir: libc + Flags: [-fmultilib-flag=value-name] + +Flags: + - Name: a + Values: + - Name: value-name + - Name: value-a + Default: value-name + - Name: b + Values: + - Name: value-name + Default: value-name diff --git a/clang/test/Driver/hip-cuid.hip b/clang/test/Driver/hip-cuid.hip index 2e38c59ccf5ef..78c391c966e2a 100644 --- a/clang/test/Driver/hip-cuid.hip +++ b/clang/test/Driver/hip-cuid.hip @@ -80,16 +80,37 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck -check-prefixes=DEVICE %s +// Check cuid is supported by 
the new driver. +// RUN: %clang -### -x hip \ +// RUN: --target=x86_64-unknown-linux-gnu \ +// RUN: --no-offload-new-driver \ +// RUN: --offload-arch=gfx900 \ +// RUN: --offload-arch=gfx906 \ +// RUN: -c -nogpuinc -nogpulib --offload-new-driver \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck -check-prefixes=COMMON,HEX %s + +// Check cuid is supported by CUDA by the default new driver. +// RUN: %clang -### -x cu \ +// RUN: --target=x86_64-unknown-linux-gnu \ +// RUN: --offload-arch=sm_60 \ +// RUN: --offload-arch=sm_70 \ +// RUN: -c -nogpuinc -nogpulib \ +// RUN: %S/Inputs/hip_multiple_inputs/a.cu \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck -check-prefixes=COMMON,HEX %s + // INVALID: invalid value 'invalid' in '-fuse-cuid=invalid' -// COMMON: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" -// COMMON-SAME: "-target-cpu" "gfx900" +// COMMON: "-cc1"{{.*}} "-triple" "[[TRIP:(amdgcn-amd-amdhsa|nvptx64-nvidia-cuda)]]" +// COMMON-SAME: "-target-cpu" "[[G1:(gfx900|sm_60)]]" // HEX-SAME: "-cuid=[[CUID:[0-9a-f]+]]" // FIXED-SAME: "-cuid=[[CUID:xyz_123]]" // COMMON-SAME: "{{.*}}a.cu" -// COMMON: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" -// COMMON-SAME: "-target-cpu" "gfx906" +// COMMON: "-cc1"{{.*}} "-triple" "[[TRIP]]" +// COMMON-SAME: "-target-cpu" "[[G2:(gfx906|sm_70)]]" // COMMON-SAME: "-cuid=[[CUID]]" // COMMON-SAME: "{{.*}}a.cu" @@ -97,15 +118,15 @@ // COMMON-SAME: "-cuid=[[CUID]]" // COMMON-SAME: "{{.*}}a.cu" -// COMMON: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" -// COMMON-SAME: "-target-cpu" "gfx900" +// COMMON: "-cc1"{{.*}} "-triple" "[[TRIP]]" +// COMMON-SAME: "-target-cpu" "[[G1]]" // HEX-NOT: "-cuid=[[CUID]]" // HEX-SAME: "-cuid=[[CUID2:[0-9a-f]+]]" // FIXED-SAME: "-cuid=[[CUID2:xyz_123]]" // COMMON-SAME: "{{.*}}b.hip" -// COMMON: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" -// COMMON-SAME: "-target-cpu" "gfx906" +// COMMON: "-cc1"{{.*}} "-triple" "[[TRIP]]" +// COMMON-SAME: "-target-cpu" "[[G2]]" // HEX-NOT: "-cuid=[[CUID]]" // COMMON-SAME: "-cuid=[[CUID2]]" // COMMON-SAME: "{{.*}}b.hip" diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp index 8d17fe1549e34..7606713bfa22a 100644 --- a/clang/test/Driver/modules-print-library-module-manifest-path.cpp +++ b/clang/test/Driver/modules-print-library-module-manifest-path.cpp @@ -48,6 +48,9 @@ // RUN: --target=x86_64-linux-gnu 2>&1 \ // RUN: | FileCheck libcxx-no-shared-lib.cpp +// Testing with libstdc++ +// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libstdc++.so +// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libstdc++.modules.json // RUN: %clang -print-library-module-manifest-path \ // RUN: -stdlib=libstdc++ \ // RUN: -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \ @@ -74,4 +77,4 @@ //--- libstdcxx.cpp -// CHECK: +// CHECK: {{.*}}libstdc++.modules.json \ No newline at end of file diff --git a/clang/test/Modules/Reachability-template-default-arg.cpp b/clang/test/Modules/Reachability-template-default-arg.cpp index 35c647d0d344b..a7da86b8cc2d5 100644 --- a/clang/test/Modules/Reachability-template-default-arg.cpp +++ b/clang/test/Modules/Reachability-template-default-arg.cpp @@ -21,6 +21,5 @@ struct A { import template_default_arg; void bar() { A<> a0; - A a1; // expected-error {{declaration of 't' must be imported from module 'template_default_arg' before it is required}} - // expected-note@* {{declaration here is not visible}} + A a1; // 
expected-error {{use of undeclared identifier 't'}} } diff --git a/clang/test/Modules/cxx20-10-1-ex2.cpp b/clang/test/Modules/cxx20-10-1-ex2.cpp index fc61d89926d44..8611d6d64c851 100644 --- a/clang/test/Modules/cxx20-10-1-ex2.cpp +++ b/clang/test/Modules/cxx20-10-1-ex2.cpp @@ -78,8 +78,7 @@ int &c = n; // OK //--- std10-1-ex2-tu6.cpp import B; // error, n is module-local and this is not a module. -int &c = n; // expected-error {{declaration of 'n' must be imported}} - // expected-note@* {{declaration here is not visible}} +int &c = n; // expected-error {{use of undeclared identifier 'n'}} //--- std10-1-ex2-tu7.cpp // expected-no-diagnostics diff --git a/clang/test/Modules/deduction-guide3.cppm b/clang/test/Modules/deduction-guide3.cppm index 1165dd40bcfb8..f7990004cec7c 100644 --- a/clang/test/Modules/deduction-guide3.cppm +++ b/clang/test/Modules/deduction-guide3.cppm @@ -22,8 +22,6 @@ Templ(T t) -> Templ; //--- Use.cpp import Templ; void func() { - Templ t(5); // expected-error {{declaration of 'Templ' must be imported from module 'Templ' before it is required}} - // expected-error@-1 {{unknown type name 'Templ'}} - // expected-note@Templ.cppm:3 {{declaration here is not visible}} + Templ t(5); // expected-error {{unknown type name 'Templ'}} } diff --git a/clang/test/Modules/module-local-with-templates.cppm b/clang/test/Modules/module-local-with-templates.cppm new file mode 100644 index 0000000000000..87955bdd3f99e --- /dev/null +++ b/clang/test/Modules/module-local-with-templates.cppm @@ -0,0 +1,79 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/use.cc -fmodule-file=a=%t/a.pcm -fsyntax-only -verify +// RUN: %clang_cc1 -std=c++20 %t/a-part.cppm -fmodule-file=a=%t/a.pcm -fsyntax-only -verify +// +// Test again with reduced BMI +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/use.cc -fmodule-file=a=%t/a.pcm -fsyntax-only -verify +// RUN: %clang_cc1 -std=c++20 %t/a-part.cppm -fmodule-file=a=%t/a.pcm -fsyntax-only -verify +// RUN: %clang_cc1 -std=c++20 %t/a.cc -fmodule-file=a=%t/a.pcm -fsyntax-only -verify + + +//--- a.cppm +export module a; + +constexpr int x = 43; + +export constexpr int f() { return x; } + +export template +constexpr T g() { + return x; +} + +namespace nn { + +constexpr int x = 88; + +export constexpr int f() { return x; } + +export template +constexpr T g() { + return x; +} +} + +//--- use.cc +// expected-no-diagnostics +import a; + +static_assert(f() == 43, ""); + +constexpr int x = 99; + +static_assert(g() == 43, ""); + +static_assert(x == 99, ""); + +namespace nn { +static_assert(f() == 88, ""); + +constexpr int x = 1000; + +static_assert(g() == 88, ""); + +static_assert(x == 1000, ""); + +} + +//--- a-part.cppm +module a:impl; +import a; + +static_assert(x == 43, ""); + +constexpr int x = 1000; // expected-error {{redefinition of 'x'}} + // expected-note@* {{previous definition is here}} + +//--- a.cc +module a; + +static_assert(x == 43, ""); + +constexpr int x = 1000; // expected-error {{redefinition of 'x'}} + // expected-note@* {{previous definition is here}} + diff --git a/clang/test/Modules/pr90154.cppm b/clang/test/Modules/pr90154.cppm new file mode 100644 index 0000000000000..d626646fbc488 --- /dev/null +++ b/clang/test/Modules/pr90154.cppm @@ -0,0 +1,25 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 
%t/a.cppm -emit-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/use.cc -fmodule-file=a=%t/a.pcm -fsyntax-only -verify +// +// Test again with reduced BMI +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/use.cc -fmodule-file=a=%t/a.pcm -fsyntax-only -verify + +//--- a.cppm +export module a; +int b = 99; +namespace a { int a = 43; } + +//--- use.cc +// expected-no-diagnostics +import a; + +namespace a { + double a = 43.0; +} + +int b = 883; diff --git a/clang/test/Preprocessor/arm-acle-6.4.c b/clang/test/Preprocessor/arm-acle-6.4.c index fcabe028b9559..2c8f4868263a6 100644 --- a/clang/test/Preprocessor/arm-acle-6.4.c +++ b/clang/test/Preprocessor/arm-acle-6.4.c @@ -93,6 +93,10 @@ // CHECK-V6K: __ARM_FEATURE_LDREX 0xf +// RUN: %clang -target arm-none-linux-eabi -march=armv6kz -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V6KZ + +// CHECK-V6KZ: __ARM_FEATURE_LDREX 0xf + // RUN: %clang -target arm-none-linux-eabi -march=armv7-a -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V7A // CHECK-V7A: __ARM_ARCH 7 diff --git a/clang/test/SemaCXX/ptrauth-type-discriminator.cpp b/clang/test/SemaCXX/ptrauth-type-discriminator.cpp index 685ca1f03fddd..f5b71ed86acf7 100644 --- a/clang/test/SemaCXX/ptrauth-type-discriminator.cpp +++ b/clang/test/SemaCXX/ptrauth-type-discriminator.cpp @@ -1,6 +1,9 @@ // RUN: %clang_cc1 -triple arm64-apple-ios -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics %s // RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics %s +// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics -fexperimental-new-constant-interpreter %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++17 -Wno-vla -fsyntax-only -verify -fptrauth-intrinsics -fexperimental-new-constant-interpreter %s + // RUN: not %clang_cc1 -triple arm64-apple-ios -std=c++17 -Wno-vla -fsyntax-only %s 2>&1 | FileCheck %s // CHECK: this target does not support pointer authentication diff --git a/clang/test/SemaCXX/warn-inconsistent-missing-destructor-override b/clang/test/SemaCXX/warn-inconsistent-missing-destructor-override.cpp similarity index 100% rename from clang/test/SemaCXX/warn-inconsistent-missing-destructor-override rename to clang/test/SemaCXX/warn-inconsistent-missing-destructor-override.cpp diff --git a/clang/test/SemaCXX/warn-suggest-destructor-override b/clang/test/SemaCXX/warn-suggest-destructor-override.cpp similarity index 100% rename from clang/test/SemaCXX/warn-suggest-destructor-override rename to clang/test/SemaCXX/warn-suggest-destructor-override.cpp diff --git a/clang/test/SemaCXX/warn-suggest-override b/clang/test/SemaCXX/warn-suggest-override.cpp similarity index 58% rename from clang/test/SemaCXX/warn-suggest-override rename to clang/test/SemaCXX/warn-suggest-override.cpp index e06c939ff001f..c4b5149c681a4 100644 --- a/clang/test/SemaCXX/warn-suggest-override +++ b/clang/test/SemaCXX/warn-suggest-override.cpp @@ -17,13 +17,13 @@ struct C { struct D : public C { void run(); - // expected-warning@-1 {{'run()' overrides a member function but is not marked 'override'}} + // expected-warning@-1 {{'run' overrides a member function but is not marked 'override'}} ~D(); }; struct E : public C { virtual void run(); - // expected-warning@-1 {{'run()' overrides a member function but is not marked 'override'}} + // expected-warning@-1 {{'run' overrides a member function but is not marked 'override'}} 
virtual ~E(); }; @@ -32,7 +32,8 @@ struct F : public C { ~F() override; }; -struct G : public C { +struct G : public C { // expected-note {{mark 'G' as 'final'}} void run() final; ~G() final; + // expected-warning@-1 {{class with destructor marked 'final' cannot be inherited from}} }; diff --git a/clang/tools/libclang/CIndexCXX.cpp b/clang/tools/libclang/CIndexCXX.cpp index a1be70dde9f67..8b84fdc22ecff 100644 --- a/clang/tools/libclang/CIndexCXX.cpp +++ b/clang/tools/libclang/CIndexCXX.cpp @@ -27,6 +27,33 @@ unsigned clang_isVirtualBase(CXCursor C) { return B->isVirtual(); } +unsigned clang_visitCXXBaseClasses(CXType PT, CXFieldVisitor visitor, + CXClientData client_data) { + CXCursor PC = clang_getTypeDeclaration(PT); + if (clang_isInvalid(PC.kind)) + return false; + const CXXRecordDecl *RD = + dyn_cast_if_present(cxcursor::getCursorDecl(PC)); + if (!RD || RD->isInvalidDecl()) + return false; + RD = RD->getDefinition(); + if (!RD || RD->isInvalidDecl()) + return false; + + for (auto &Base : RD->bases()) { + // Callback to the client. + switch ( + visitor(cxcursor::MakeCursorCXXBaseSpecifier(&Base, getCursorTU(PC)), + client_data)) { + case CXVisit_Break: + return true; + case CXVisit_Continue: + break; + } + } + return true; +} + enum CX_CXXAccessSpecifier clang_getCXXAccessSpecifier(CXCursor C) { AccessSpecifier spec = AS_none; diff --git a/clang/tools/libclang/CXType.cpp b/clang/tools/libclang/CXType.cpp index f1b661435c499..5da87c6f4aa9c 100644 --- a/clang/tools/libclang/CXType.cpp +++ b/clang/tools/libclang/CXType.cpp @@ -19,6 +19,7 @@ #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/Expr.h" +#include "clang/AST/RecordLayout.h" #include "clang/AST/Type.h" #include "clang/Basic/AddressSpaces.h" #include "clang/Frontend/ASTUnit.h" @@ -1108,6 +1109,39 @@ long long clang_Cursor_getOffsetOfField(CXCursor C) { return -1; } +long long clang_getOffsetOfBase(CXCursor Parent, CXCursor Base) { + if (Base.kind != CXCursor_CXXBaseSpecifier) + return -1; + + if (!clang_isDeclaration(Parent.kind)) + return -1; + + // we need to validate the parent type + CXType PT = clang_getCursorType(Parent); + long long Error = validateFieldParentType(Parent, PT); + if (Error < 0) + return Error; + + const CXXRecordDecl *ParentRD = + dyn_cast(cxcursor::getCursorDecl(Parent)); + if (!ParentRD) + return -1; + + ASTContext &Ctx = cxcursor::getCursorContext(Base); + const CXXBaseSpecifier *B = cxcursor::getCursorCXXBaseSpecifier(Base); + if (ParentRD->bases_begin() > B || ParentRD->bases_end() <= B) + return -1; + + const CXXRecordDecl *BaseRD = B->getType()->getAsCXXRecordDecl(); + if (!BaseRD) + return -1; + + const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(ParentRD); + if (B->isVirtual()) + return Ctx.toBits(Layout.getVBaseClassOffset(BaseRD)); + return Ctx.toBits(Layout.getBaseClassOffset(BaseRD)); +} + enum CXRefQualifierKind clang_Type_getCXXRefQualifier(CXType T) { QualType QT = GetQualType(T); if (QT.isNull()) diff --git a/clang/tools/libclang/libclang.map b/clang/tools/libclang/libclang.map index 00ba56ab3c79d..8ca8a58b76d9e 100644 --- a/clang/tools/libclang/libclang.map +++ b/clang/tools/libclang/libclang.map @@ -436,8 +436,10 @@ LLVM_19 { LLVM_20 { global: + clang_getOffsetOfBase; clang_getTypePrettyPrinted; clang_isBeforeInTranslationUnit; + clang_visitCXXBaseClasses; }; # Example of how to add a new symbol version entry. 
If you do add a new symbol diff --git a/clang/unittests/AST/ExternalASTSourceTest.cpp b/clang/unittests/AST/ExternalASTSourceTest.cpp index 8e1bde1247f66..ad209604971f4 100644 --- a/clang/unittests/AST/ExternalASTSourceTest.cpp +++ b/clang/unittests/AST/ExternalASTSourceTest.cpp @@ -68,7 +68,8 @@ TEST(ExternalASTSourceTest, FailedLookupOccursOnce) { TestSource(unsigned &Calls) : Calls(Calls) {} bool FindExternalVisibleDeclsByName(const DeclContext *, - DeclarationName Name) override { + DeclarationName Name, + Module *NamedModule) override { if (Name.getAsString() == "j") ++Calls; return false; diff --git a/clang/unittests/Tooling/ToolingTest.cpp b/clang/unittests/Tooling/ToolingTest.cpp index 0b65577a05193..8cdfffb54390e 100644 --- a/clang/unittests/Tooling/ToolingTest.cpp +++ b/clang/unittests/Tooling/ToolingTest.cpp @@ -152,6 +152,20 @@ TEST(buildASTFromCode, ReportsErrors) { EXPECT_EQ(1u, Consumer.NumDiagnosticsSeen); } +TEST(buildASTFromCode, FileSystem) { + llvm::IntrusiveRefCntPtr InMemoryFileSystem( + new llvm::vfs::InMemoryFileSystem); + InMemoryFileSystem->addFile("included_file.h", 0, + llvm::MemoryBuffer::getMemBufferCopy("class X;")); + std::unique_ptr AST = buildASTFromCodeWithArgs( + R"(#include "included_file.h")", {}, "input.cc", "clang-tool", + std::make_shared(), + getClangStripDependencyFileAdjuster(), FileContentMappings(), nullptr, + InMemoryFileSystem); + ASSERT_TRUE(AST.get()); + EXPECT_TRUE(FindClassDeclX(AST.get())); +} + TEST(newFrontendActionFactory, CreatesFrontendActionFactoryFromType) { std::unique_ptr Factory( newFrontendActionFactory()); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp index b25b45e021744..4940062eeae47 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp @@ -123,6 +123,7 @@ unsigned pid_t_sz = sizeof(pid_t); unsigned timeval_sz = sizeof(timeval); unsigned uid_t_sz = sizeof(uid_t); unsigned gid_t_sz = sizeof(gid_t); +unsigned fpos_t_sz = sizeof(fpos_t); unsigned mbstate_t_sz = sizeof(mbstate_t); unsigned sigset_t_sz = sizeof(sigset_t); unsigned struct_timezone_sz = sizeof(struct timezone); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h index 3942f1523437f..8ce73f206fd88 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h @@ -46,6 +46,7 @@ extern unsigned pid_t_sz; extern unsigned timeval_sz; extern unsigned uid_t_sz; extern unsigned gid_t_sz; +extern unsigned fpos_t_sz; extern unsigned mbstate_t_sz; extern unsigned struct_timezone_sz; extern unsigned struct_tms_sz; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp index ecabbf0d08e2c..aacd28c55ceaa 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp @@ -547,6 +547,7 @@ unsigned pid_t_sz = sizeof(pid_t); unsigned timeval_sz = sizeof(timeval); unsigned uid_t_sz = sizeof(uid_t); unsigned gid_t_sz = sizeof(gid_t); +unsigned fpos_t_sz = sizeof(fpos_t); unsigned mbstate_t_sz = sizeof(mbstate_t); unsigned sigset_t_sz = sizeof(sigset_t); unsigned struct_timezone_sz = 
sizeof(struct timezone); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h index 4f892577d0b00..3758a9101c2a0 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h @@ -36,6 +36,7 @@ extern unsigned pid_t_sz; extern unsigned timeval_sz; extern unsigned uid_t_sz; extern unsigned gid_t_sz; +extern unsigned fpos_t_sz; extern unsigned mbstate_t_sz; extern unsigned struct_timezone_sz; extern unsigned struct_tms_sz; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp index dad7bde1498a7..7ea6134b702bf 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -135,6 +136,8 @@ namespace __sanitizer { unsigned struct_sioc_sg_req_sz = sizeof(struct sioc_sg_req); unsigned struct_sioc_vif_req_sz = sizeof(struct sioc_vif_req); + unsigned fpos_t_sz = sizeof(fpos_t); + const unsigned IOCTL_NOT_PRESENT = 0; unsigned IOCTL_FIOASYNC = FIOASYNC; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.h index 84a81265162c6..bf6586d27228f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_solaris.h @@ -418,6 +418,8 @@ extern unsigned struct_winsize_sz; extern unsigned struct_sioc_sg_req_sz; extern unsigned struct_sioc_vif_req_sz; +extern unsigned fpos_t_sz; + // ioctl request identifiers // A special value to mark ioctls that are not present on the target platform, diff --git a/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S b/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S index 2d0d8d8c19af4..ae02ada4032fd 100644 --- a/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S +++ b/compiler-rt/test/orc/TestCases/Darwin/x86-64/objc-imageinfo.S @@ -20,17 +20,17 @@ // NEW: MachOPlatform: Registered __objc_imageinfo for main // NEW-SAME: flags = 0x0040 // RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \ -// RUN: %t/swift_4.o 2>&1 +// RUN: %t/swift_4.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_4 // SWIFT_4: MachOPlatform: Registered __objc_imageinfo for main // SWIFT_4-SAME: flags = 0x0640 // RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \ -// RUN: %t/swift_5.o 2>&1 +// RUN: %t/swift_5.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_5 // SWIFT_5: MachOPlatform: Registered __objc_imageinfo for main // SWIFT_5-SAME: flags = 0x5000740 // RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o \ -// RUN: %t/swift_59.o 2>&1 +// RUN: %t/swift_59.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_59 // SWIFT_59: MachOPlatform: Registered __objc_imageinfo for main // SWIFT_59-SAME: flags = 0x5090740 @@ -50,25 +50,25 @@ // Add swift to objc. // RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/swift_59.o \ -// RUN: %t/objc_new.o 2>&1 +// RUN: %t/objc_new.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_MIX2 // SWIFT_MIX2: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5090740 // Add multiple swift to objc. 
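// Note: the changes below add the trailing '\' continuations that were
// missing from several RUN lines. Without them, lit treated the
// "| FileCheck ..." fragment (and, in two cases, the extra input object) as
// a separate RUN command rather than part of the pipeline being checked.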
// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/swift_59.o \ -// RUN: %t/swift_5.o %t/objc_new.o 2>&1 +// RUN: %t/swift_5.o %t/objc_new.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_MIX3 // SWIFT_MIX3: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x5000740 // Disable categories. -// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_old.o -// RUN: %t/objc_new.o 2>&1 +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_old.o \ +// RUN: %t/objc_new.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_MIX4 // SWIFT_MIX4: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x0000 // Disable signed class_ro. -// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_new.o -// RUN: %t/objc_new_signed_ro.o 2>&1 +// RUN: %llvm_jitlink -num-threads=0 -debug-only=orc %t/main.o %t/objc_new.o \ +// RUN: %t/objc_new_signed_ro.o 2>&1 \ // RUN: | FileCheck %s -check-prefix=SWIFT_MIX5 // SWIFT_MIX5: MachOPlatform: Merging __objc_imageinfo flags for main {{.*}} -> 0x0040 diff --git a/flang/include/flang/Evaluate/target.h b/flang/include/flang/Evaluate/target.h index 154561ce868eb..e07f916b875e0 100644 --- a/flang/include/flang/Evaluate/target.h +++ b/flang/include/flang/Evaluate/target.h @@ -112,6 +112,9 @@ class TargetCharacteristics { bool isPPC() const { return isPPC_; } void set_isPPC(bool isPPC = false); + bool isSPARC() const { return isSPARC_; } + void set_isSPARC(bool isSPARC = false); + bool isOSWindows() const { return isOSWindows_; } void set_isOSWindows(bool isOSWindows = false) { isOSWindows_ = isOSWindows; @@ -126,6 +129,7 @@ class TargetCharacteristics { std::uint8_t align_[common::TypeCategory_enumSize][maxKind + 1]{}; bool isBigEndian_{false}; bool isPPC_{false}; + bool isSPARC_{false}; bool isOSWindows_{false}; bool haltingSupportIsUnknownAtCompileTime_{false}; bool areSubnormalsFlushedToZero_{false}; diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 18f84c7021e11..9c9c0609f4fc3 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -269,10 +269,8 @@ struct IntrinsicLibrary { mlir::Value genIeeeCopySign(mlir::Type, llvm::ArrayRef); void genIeeeGetFlag(llvm::ArrayRef); void genIeeeGetHaltingMode(llvm::ArrayRef); - template - void genIeeeGetOrSetModes(llvm::ArrayRef); - template - void genIeeeGetOrSetStatus(llvm::ArrayRef); + template + void genIeeeGetOrSetModesOrStatus(llvm::ArrayRef); void genIeeeGetRoundingMode(llvm::ArrayRef); void genIeeeGetUnderflowMode(llvm::ArrayRef); mlir::Value genIeeeInt(mlir::Type, llvm::ArrayRef); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h b/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h index f44e0c95ef6d4..7487444f3a7a9 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Exceptions.h @@ -33,5 +33,9 @@ mlir::Value genGetUnderflowMode(fir::FirOpBuilder &builder, mlir::Location loc); void genSetUnderflowMode(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value bit); +mlir::Value genGetModesTypeSize(fir::FirOpBuilder &builder, mlir::Location loc); +mlir::Value genGetStatusTypeSize(fir::FirOpBuilder &builder, + mlir::Location loc); + } // namespace fir::runtime #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_EXCEPTIONS_H diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td 
b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td index 6f886726b1283..a270e69b39410 100644 --- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td +++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td @@ -335,4 +335,16 @@ def cuf_RegisterKernelOp : cuf_Op<"register_kernel", []> { }]; } +def cuf_DeviceAddressOp : cuf_Op<"device_address", []> { + let summary = "Get the device address from a host symbol"; + + let arguments = (ins SymbolRefAttr:$hostSymbol); + + let assemblyFormat = [{ + $hostSymbol attr-dict `->` type($addr) + }]; + + let results = (outs fir_ReferenceType:$addr); +} + #endif // FORTRAN_DIALECT_CUF_CUF_OPS diff --git a/flang/include/flang/Runtime/exceptions.h b/flang/include/flang/Runtime/exceptions.h index 483d0271bcab0..62c21f01c1289 100644 --- a/flang/include/flang/Runtime/exceptions.h +++ b/flang/include/flang/Runtime/exceptions.h @@ -13,6 +13,7 @@ #include "flang/Runtime/entry-names.h" #include +#include namespace Fortran::runtime { @@ -32,6 +33,10 @@ bool RTNAME(SupportHalting)(uint32_t except); bool RTNAME(GetUnderflowMode)(void); void RTNAME(SetUnderflowMode)(bool flag); +// Get the byte size of ieee_modes_type and ieee_status_type data. +std::size_t RTNAME(GetModesTypeSize)(void); +std::size_t RTNAME(GetStatusTypeSize)(void); + } // extern "C" } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_EXCEPTIONS_H_ diff --git a/flang/include/flang/Runtime/magic-numbers.h b/flang/include/flang/Runtime/magic-numbers.h index 1d3c5dca0b4bf..6788ba098bcf9 100644 --- a/flang/include/flang/Runtime/magic-numbers.h +++ b/flang/include/flang/Runtime/magic-numbers.h @@ -118,11 +118,10 @@ ieee_arithmetic module rounding procedures. #define _FORTRAN_RUNTIME_IEEE_OTHER 5 #if 0 -The size of derived types ieee_modes_type and ieee_status_type from intrinsic -module ieee_exceptions must be large enough to hold an fenv.h object of type -femode_t and fenv_t, respectively. These types have members that are declared -as int arrays with the following extents to allow build time validation of -these sizes in cross compilation environments. +INTEGER(kind=4) extents for ieee_exceptions module types ieee_modes_type and +ieee_status_type. These extent values are large enough to hold femode_t and +fenv_t data in many environments. An environment that does not meet these +size constraints may allocate memory with runtime size values. 
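Concretely, the reserved storage is 2 x 4 = 8 bytes for femode_t data and
8 x 4 = 32 bytes for fenv_t data. An environment whose femode_t or fenv_t is
larger falls back to the allocatable component added to these derived types
in __fortran_ieee_exceptions.f90, with its size obtained at runtime from the
GetModesTypeSize and GetStatusTypeSize runtime library calls.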
#endif #define _FORTRAN_RUNTIME_IEEE_FEMODE_T_EXTENT 2 #define _FORTRAN_RUNTIME_IEEE_FENV_T_EXTENT 8 diff --git a/flang/include/flang/Tools/TargetSetup.h b/flang/include/flang/Tools/TargetSetup.h index 709c4bbe4b7b0..d1b0da3a42c89 100644 --- a/flang/include/flang/Tools/TargetSetup.h +++ b/flang/include/flang/Tools/TargetSetup.h @@ -71,6 +71,9 @@ namespace Fortran::tools { if (targetTriple.isPPC()) targetCharacteristics.set_isPPC(true); + if (targetTriple.isSPARC()) + targetCharacteristics.set_isSPARC(true); + if (targetTriple.isOSWindows()) targetCharacteristics.set_isOSWindows(true); diff --git a/flang/lib/Evaluate/target.cpp b/flang/lib/Evaluate/target.cpp index 409e28c767e1e..94dc35ecd5900 100644 --- a/flang/lib/Evaluate/target.cpp +++ b/flang/lib/Evaluate/target.cpp @@ -104,6 +104,7 @@ void TargetCharacteristics::set_isBigEndian(bool isBig) { } void TargetCharacteristics::set_isPPC(bool isPowerPC) { isPPC_ = isPowerPC; } +void TargetCharacteristics::set_isSPARC(bool isSPARC) { isSPARC_ = isSPARC; } void TargetCharacteristics::set_areSubnormalsFlushedToZero(bool yes) { areSubnormalsFlushedToZero_ = yes; diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index e6d0f044dcf84..f6f2e15e469e6 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -50,6 +50,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include // temporary -- only used in genIeeeGetOrSetModesOrStatus #include #include @@ -318,13 +319,15 @@ static constexpr IntrinsicHandler handlers[]{ {"ieee_get_halting_mode", &I::genIeeeGetHaltingMode, {{{"flag", asValue}, {"halting", asAddr}}}}, - {"ieee_get_modes", &I::genIeeeGetOrSetModes}, + {"ieee_get_modes", + &I::genIeeeGetOrSetModesOrStatus}, {"ieee_get_rounding_mode", &I::genIeeeGetRoundingMode, {{{"round_value", asAddr, handleDynamicOptional}, {"radix", asValue, handleDynamicOptional}}}, /*isElemental=*/false}, - {"ieee_get_status", &I::genIeeeGetOrSetStatus}, + {"ieee_get_status", + &I::genIeeeGetOrSetModesOrStatus}, {"ieee_get_underflow_mode", &I::genIeeeGetUnderflowMode, {{{"gradual", asAddr}}}, @@ -368,13 +371,15 @@ static constexpr IntrinsicHandler handlers[]{ {"ieee_set_flag", &I::genIeeeSetFlagOrHaltingMode}, {"ieee_set_halting_mode", &I::genIeeeSetFlagOrHaltingMode}, - {"ieee_set_modes", &I::genIeeeGetOrSetModes}, + {"ieee_set_modes", + &I::genIeeeGetOrSetModesOrStatus}, {"ieee_set_rounding_mode", &I::genIeeeSetRoundingMode, {{{"round_value", asValue, handleDynamicOptional}, {"radix", asValue, handleDynamicOptional}}}, /*isElemental=*/false}, - {"ieee_set_status", &I::genIeeeGetOrSetStatus}, + {"ieee_set_status", + &I::genIeeeGetOrSetModesOrStatus}, {"ieee_set_underflow_mode", &I::genIeeeSetUnderflowMode}, {"ieee_signaling_eq", &I::genIeeeSignalingCompare}, @@ -4108,11 +4113,12 @@ void IntrinsicLibrary::genRaiseExcept(int excepts, mlir::Value cond) { // Return a reference to the contents of a derived type with one field. // Also return the field type. 
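// A nonzero `index` selects the corresponding component of a derived type
// with more than one field; the default of 0 preserves the original
// single-component behavior.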
static std::pair -getFieldRef(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value rec) { +getFieldRef(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value rec, + unsigned index = 0) { auto recType = mlir::dyn_cast(fir::unwrapPassByRefType(rec.getType())); - assert(recType.getTypeList().size() == 1 && "expected exactly one component"); - auto [fieldName, fieldTy] = recType.getTypeList().front(); + assert(index < recType.getTypeList().size() && "not enough components"); + auto [fieldName, fieldTy] = recType.getTypeList()[index]; mlir::Value field = builder.create( loc, fir::FieldType::get(recType.getContext()), fieldName, recType, fir::getTypeParams(rec)); @@ -4502,15 +4508,60 @@ void IntrinsicLibrary::genIeeeGetHaltingMode( } // IEEE_GET_MODES, IEEE_SET_MODES -template -void IntrinsicLibrary::genIeeeGetOrSetModes( +// IEEE_GET_STATUS, IEEE_SET_STATUS +template +void IntrinsicLibrary::genIeeeGetOrSetModesOrStatus( llvm::ArrayRef args) { assert(args.size() == 1); - mlir::Type ptrTy = builder.getRefType(builder.getIntegerType(32)); +#ifndef __GLIBC_USE_IEC_60559_BFP_EXT // only use of "#include " + // No definitions of fegetmode, fesetmode + llvm::StringRef func = isModes + ? (isGet ? "ieee_get_modes" : "ieee_set_modes") + : (isGet ? "ieee_get_status" : "ieee_set_status"); + TODO(loc, "intrinsic module procedure: " + func); +#else mlir::Type i32Ty = builder.getIntegerType(32); - mlir::Value addr = - builder.create(loc, ptrTy, getBase(args[0])); - genRuntimeCall(isGet ? "fegetmode" : "fesetmode", i32Ty, addr); + mlir::Type i64Ty = builder.getIntegerType(64); + mlir::Type ptrTy = builder.getRefType(i32Ty); + mlir::Value addr; + if (fir::getTargetTriple(builder.getModule()).isSPARC()) { + // Floating point environment data is larger than the __data field + // allotment. Allocate data space from the heap. + auto [fieldRef, fieldTy] = + getFieldRef(builder, loc, fir::getBase(args[0]), 1); + addr = builder.create( + loc, builder.create(loc, fieldRef)); + mlir::Type heapTy = addr.getType(); + mlir::Value allocated = builder.create( + loc, mlir::arith::CmpIPredicate::ne, + builder.createConvert(loc, i64Ty, addr), + builder.createIntegerConstant(loc, i64Ty, 0)); + auto ifOp = builder.create(loc, heapTy, allocated, + /*withElseRegion=*/true); + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + builder.create(loc, addr); + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + mlir::Value byteSize = + isModes ? fir::runtime::genGetModesTypeSize(builder, loc) + : fir::runtime::genGetStatusTypeSize(builder, loc); + byteSize = builder.createConvert(loc, builder.getIndexType(), byteSize); + addr = + builder.create(loc, extractSequenceType(heapTy), + /*typeparams=*/std::nullopt, byteSize); + mlir::Value shape = builder.create(loc, byteSize); + builder.create( + loc, builder.create(loc, fieldTy, addr, shape), fieldRef); + builder.create(loc, addr); + builder.setInsertionPointAfter(ifOp); + addr = builder.create(loc, ptrTy, ifOp.getResult(0)); + } else { + // Place floating point environment data in __data storage. + addr = builder.create(loc, ptrTy, getBase(args[0])); + } + llvm::StringRef func = isModes ? (isGet ? "fegetmode" : "fesetmode") + : (isGet ? "fegetenv" : "fesetenv"); + genRuntimeCall(func, i32Ty, addr); +#endif } // Check that an explicit ieee_[get|set]_rounding_mode call radix value is 2. 
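// For orientation, the SPARC branch above lowers to logic roughly like this
// C++ sketch (hypothetical pseudocode; the real code builds FIR ops, and
// _FortranAGetModesTypeSize is assumed to be the RTNAME expansion of
// GetModesTypeSize):
//
//   auto &heap = modes.__allocatable_data;           // component index 1
//   if (heap == nullptr)                             // first use: allocate
//     heap = new char[_FortranAGetModesTypeSize()];  // runtime-reported size
//   fegetmode(reinterpret_cast<femode_t *>(heap));
//
// On other targets femode_t/fenv_t fit in the fixed __data component, so its
// address is passed to fegetmode/fesetmode (or fegetenv/fesetenv) directly.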
@@ -4543,18 +4594,6 @@ void IntrinsicLibrary::genIeeeGetRoundingMode(
   builder.create<fir::StoreOp>(loc, mode, fieldRef);
 }
 
-// IEEE_GET_STATUS, IEEE_SET_STATUS
-template <bool isGet>
-void IntrinsicLibrary::genIeeeGetOrSetStatus(
-    llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 1);
-  mlir::Type ptrTy = builder.getRefType(builder.getIntegerType(32));
-  mlir::Type i32Ty = builder.getIntegerType(32);
-  mlir::Value addr =
-      builder.create<fir::ConvertOp>(loc, ptrTy, getBase(args[0]));
-  genRuntimeCall(isGet ? "fegetenv" : "fesetenv", i32Ty, addr);
-}
-
 // IEEE_GET_UNDERFLOW_MODE
 void IntrinsicLibrary::genIeeeGetUnderflowMode(
     llvm::ArrayRef<fir::ExtendedValue> args) {
diff --git a/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp b/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp
index 630281fdb593d..c545b3d00b4d7 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Exceptions.cpp
@@ -42,3 +42,17 @@ void fir::runtime::genSetUnderflowMode(fir::FirOpBuilder &builder,
       fir::runtime::getRuntimeFunc<mkRTKey(SetUnderflowMode)>(loc, builder)};
   builder.create<fir::CallOp>(loc, func, flag);
 }
+
+mlir::Value fir::runtime::genGetModesTypeSize(fir::FirOpBuilder &builder,
+                                              mlir::Location loc) {
+  mlir::func::FuncOp func{
+      fir::runtime::getRuntimeFunc<mkRTKey(GetModesTypeSize)>(loc, builder)};
+  return builder.create<fir::CallOp>(loc, func).getResult(0);
+}
+
+mlir::Value fir::runtime::genGetStatusTypeSize(fir::FirOpBuilder &builder,
+                                               mlir::Location loc) {
+  mlir::func::FuncOp func{
+      fir::runtime::getRuntimeFunc<mkRTKey(GetStatusTypeSize)>(loc, builder)};
+  return builder.create<fir::CallOp>(loc, func).getResult(0);
+}
diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
index 2e6c272fa9089..5ce39f99bbb12 100644
--- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp
@@ -1,4 +1,4 @@
-//===-- CUFOpConversion.cpp -----------------------------------------------===//
+//===-- CUFDeviceGlobal.cpp -----------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index d61d9f63cb294..e93bed37d39f7 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -366,22 +366,47 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern { const fir::LLVMTypeConverter *typeConverter; }; -static mlir::Value genGetDeviceAddress(mlir::PatternRewriter &rewriter, - mlir::ModuleOp mod, mlir::Location loc, - mlir::Value inputArg) { - fir::FirOpBuilder builder(rewriter, mod); - mlir::func::FuncOp callee = - fir::runtime::getRuntimeFunc(loc, builder); - auto fTy = callee.getFunctionType(); - mlir::Value conv = createConvertOp(rewriter, loc, fTy.getInput(0), inputArg); - mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); - mlir::Value sourceLine = - fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); - llvm::SmallVector args{fir::runtime::createArguments( - builder, loc, fTy, conv, sourceFile, sourceLine)}; - auto call = rewriter.create(loc, callee, args); - return createConvertOp(rewriter, loc, inputArg.getType(), call->getResult(0)); -} +struct CUFDeviceAddressOpConversion + : public mlir::OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + CUFDeviceAddressOpConversion(mlir::MLIRContext *context, + const mlir::SymbolTable &symtab) + : OpRewritePattern(context), symTab{symtab} {} + + mlir::LogicalResult + matchAndRewrite(cuf::DeviceAddressOp op, + mlir::PatternRewriter &rewriter) const override { + if (auto global = symTab.lookup( + op.getHostSymbol().getRootReference().getValue())) { + auto mod = op->getParentOfType(); + mlir::Location loc = op.getLoc(); + auto hostAddr = rewriter.create( + loc, fir::ReferenceType::get(global.getType()), op.getHostSymbol()); + fir::FirOpBuilder builder(rewriter, mod); + mlir::func::FuncOp callee = + fir::runtime::getRuntimeFunc(loc, + builder); + auto fTy = callee.getFunctionType(); + mlir::Value conv = + createConvertOp(rewriter, loc, fTy.getInput(0), hostAddr); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(2)); + llvm::SmallVector args{fir::runtime::createArguments( + builder, loc, fTy, conv, sourceFile, sourceLine)}; + auto call = rewriter.create(loc, callee, args); + mlir::Value addr = createConvertOp(rewriter, loc, hostAddr.getType(), + call->getResult(0)); + rewriter.replaceOp(op, addr.getDefiningOp()); + return success(); + } + return failure(); + } + +private: + const mlir::SymbolTable &symTab; +}; struct DeclareOpConversion : public mlir::OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -398,9 +423,8 @@ struct DeclareOpConversion : public mlir::OpRewritePattern { addrOfOp.getSymbol().getRootReference().getValue())) { if (cuf::isRegisteredDeviceGlobal(global)) { rewriter.setInsertionPointAfter(addrOfOp); - auto mod = op->getParentOfType(); - mlir::Value devAddr = genGetDeviceAddress(rewriter, mod, op.getLoc(), - addrOfOp.getResult()); + mlir::Value devAddr = rewriter.create( + op.getLoc(), addrOfOp.getType(), addrOfOp.getSymbol()); rewriter.startOpModification(op); op.getMemrefMutable().assign(devAddr); rewriter.finalizeOpModification(op); @@ -773,7 +797,6 @@ struct CUFLaunchOpConversion } } llvm::SmallVector args; - auto mod = op->getParentOfType(); for (mlir::Value arg : op.getArgs()) { // If the argument is a global descriptor, make sure we pass the device // 
copy of this descriptor and not the host one. @@ -785,8 +808,11 @@ struct CUFLaunchOpConversion if (auto global = symTab.lookup( addrOfOp.getSymbol().getRootReference().getValue())) { if (cuf::isRegisteredDeviceGlobal(global)) { - arg = genGetDeviceAddress(rewriter, mod, op.getLoc(), - declareOp.getResult()); + arg = rewriter + .create(op.getLoc(), + addrOfOp.getType(), + addrOfOp.getSymbol()) + .getResult(); } } } @@ -907,10 +933,12 @@ void cuf::populateCUFToFIRConversionPatterns( patterns.getContext()); patterns.insert(patterns.getContext(), symtab, &dl, &converter); - patterns.insert(patterns.getContext(), symtab); + patterns.insert( + patterns.getContext(), symtab); } void cuf::populateFIRCUFConversionPatterns(const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) { - patterns.insert(patterns.getContext(), symtab); + patterns.insert( + patterns.getContext(), symtab); } diff --git a/flang/module/__fortran_ieee_exceptions.f90 b/flang/module/__fortran_ieee_exceptions.f90 index 6691012eda238..3ac9b993186aa 100644 --- a/flang/module/__fortran_ieee_exceptions.f90 +++ b/flang/module/__fortran_ieee_exceptions.f90 @@ -36,13 +36,15 @@ ieee_all(*) = [ ieee_usual, ieee_underflow, ieee_inexact ] type, public :: ieee_modes_type ! Fortran 2018, 17.7 - private ! opaque fenv.h femode_t data + private ! opaque fenv.h femode_t data; code will access only one component integer(kind=4) :: __data(_FORTRAN_RUNTIME_IEEE_FEMODE_T_EXTENT) + integer(kind=1), allocatable :: __allocatable_data(:) end type ieee_modes_type type, public :: ieee_status_type ! Fortran 2018, 17.7 - private ! opaque fenv.h fenv_t data + private ! opaque fenv.h fenv_t data; code will access only one component integer(kind=4) :: __data(_FORTRAN_RUNTIME_IEEE_FENV_T_EXTENT) + integer(kind=1), allocatable :: __allocatable_data(:) end type ieee_status_type ! Define specifics with 1 LOGICAL or REAL argument for generic G. diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp index 2fa2baa2ec84a..f541b8e844ade 100644 --- a/flang/runtime/exceptions.cpp +++ b/flang/runtime/exceptions.cpp @@ -15,14 +15,10 @@ #include #endif -// When not supported, these macro are undefined in cfenv.h, -// set them to zero in that case. +// fenv.h may not define exception macros. 
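// Unlike the standard FE_* macros probed below, glibc's x86 __FE_DENORM is
// an enumerator rather than a #define, so an #ifndef probe cannot detect it;
// MapException instead guards its use with an explicit __x86_64__ check and
// maps the subnormal exception to 0 on other targets.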
#ifndef FE_INVALID #define FE_INVALID 0 #endif -#ifndef __FE_DENORM -#define __FE_DENORM 0 // denorm is nonstandard -#endif #ifndef FE_DIVBYZERO #define FE_DIVBYZERO 0 #endif @@ -46,7 +42,11 @@ uint32_t RTNAME(MapException)(uint32_t excepts) { Terminator terminator{__FILE__, __LINE__}; static constexpr uint32_t v{FE_INVALID}; - static constexpr uint32_t s{__FE_DENORM}; // subnormal +#if __x86_64__ + static constexpr uint32_t s{__FE_DENORM}; // nonstandard, not a #define +#else + static constexpr uint32_t s{0}; +#endif static constexpr uint32_t z{FE_DIVBYZERO}; static constexpr uint32_t o{FE_OVERFLOW}; static constexpr uint32_t u{FE_UNDERFLOW}; @@ -62,25 +62,13 @@ uint32_t RTNAME(MapException)(uint32_t excepts) { static constexpr uint32_t map[]{xm}; static constexpr uint32_t mapSize{sizeof(map) / sizeof(uint32_t)}; static_assert(mapSize == 64); - if (excepts == 0 || excepts >= mapSize) { + if (excepts >= mapSize) { terminator.Crash("Invalid excepts value: %d", excepts); } uint32_t except_value = map[excepts]; - if (except_value == 0) { - terminator.Crash( - "Excepts value %d not supported by flang runtime", excepts); - } return except_value; } -// Verify that the size of ieee_modes_type and ieee_status_type objects from -// intrinsic module file __fortran_ieee_exceptions.f90 are large enough to -// hold fenv_t object. -// TODO: fenv_t can be way larger than -// sizeof(int) * _FORTRAN_RUNTIME_IEEE_FENV_T_EXTENT -// on some systems, e.g. Solaris, so omit object size comparison for now. -// TODO: consider femode_t object size comparison once its more mature. - // Check if the processor has the ability to control whether to halt or // continue execution when a given exception is raised. bool RTNAME(SupportHalting)([[maybe_unused]] uint32_t except) { @@ -103,7 +91,7 @@ bool RTNAME(SupportHalting)([[maybe_unused]] uint32_t except) { } bool RTNAME(GetUnderflowMode)(void) { -#if __x86_64__ +#if _MM_FLUSH_ZERO_MASK // The MXCSR Flush to Zero flag is the negation of the ieee_get_underflow_mode // GRADUAL argument. It affects real computations of kinds 3, 4, and 8. return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_OFF; @@ -112,12 +100,23 @@ bool RTNAME(GetUnderflowMode)(void) { #endif } void RTNAME(SetUnderflowMode)(bool flag) { -#if __x86_64__ +#if _MM_FLUSH_ZERO_MASK // The MXCSR Flush to Zero flag is the negation of the ieee_set_underflow_mode // GRADUAL argument. It affects real computations of kinds 3, 4, and 8. _MM_SET_FLUSH_ZERO_MODE(flag ? 
_MM_FLUSH_ZERO_OFF : _MM_FLUSH_ZERO_ON); #endif } +size_t RTNAME(GetModesTypeSize)(void) { +#ifdef __GLIBC_USE_IEC_60559_BFP_EXT + return sizeof(femode_t); // byte size of ieee_modes_type data +#else + return 8; // femode_t is not defined +#endif +} +size_t RTNAME(GetStatusTypeSize)(void) { + return sizeof(fenv_t); // byte size of ieee_status_type data +} + } // extern "C" } // namespace Fortran::runtime diff --git a/flang/test/Fir/CUDA/cuda-data-transfer.fir b/flang/test/Fir/CUDA/cuda-data-transfer.fir index 7203c33e7eb11..5ed27f1be0a43 100644 --- a/flang/test/Fir/CUDA/cuda-data-transfer.fir +++ b/flang/test/Fir/CUDA/cuda-data-transfer.fir @@ -198,6 +198,7 @@ func.func @_QPsub8() attributes {fir.bindc_name = "t"} { // CHECK-LABEL: func.func @_QPsub8() // CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<5xi32> // CHECK: %[[LOCAL:.*]] = fir.declare %[[ALLOCA]] +// CHECK: fir.address_of(@_QMmtestsEn) : !fir.ref> // CHECK: %[[GBL:.*]] = fir.address_of(@_QMmtestsEn) : !fir.ref> // CHECK: %[[GBL_CONV:.*]] = fir.convert %[[GBL]] : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[ADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[GBL_CONV]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr @@ -222,6 +223,7 @@ func.func @_QPsub9() { // CHECK-LABEL: func.func @_QPsub9() // CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<5xi32> // CHECK: %[[LOCAL:.*]] = fir.declare %[[ALLOCA]] +// CHECK: fir.address_of(@_QMmtestsEn) : !fir.ref> // CHECK: %[[GBL:.*]] = fir.address_of(@_QMmtestsEn) : !fir.ref> // CHECK: %[[GBL_CONV:.*]] = fir.convert %[[GBL]] : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[ADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[GBL_CONV]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr @@ -380,6 +382,7 @@ func.func @_QPdevice_addr_conv() { } // CHECK-LABEL: func.func @_QPdevice_addr_conv() +// CHECK: fir.address_of(@_QMmod1Ea_dev) : !fir.ref> // CHECK: %[[GBL:.*]] = fir.address_of(@_QMmod1Ea_dev) : !fir.ref> // CHECK: %[[GBL_CONV:.*]] = fir.convert %[[GBL]] : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[ADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[GBL_CONV]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr diff --git a/flang/test/Fir/CUDA/cuda-global-addr.mlir b/flang/test/Fir/CUDA/cuda-global-addr.mlir index 94ee74736f650..0ccd0c797fb6f 100644 --- a/flang/test/Fir/CUDA/cuda-global-addr.mlir +++ b/flang/test/Fir/CUDA/cuda-global-addr.mlir @@ -26,6 +26,7 @@ func.func @_QQmain() attributes {fir.bindc_name = "test"} { } // CHECK-LABEL: func.func @_QQmain() +// CHECK: fir.address_of(@_QMmod1Eadev) : !fir.ref> // CHECK: %[[ADDR:.*]] = fir.address_of(@_QMmod1Eadev) : !fir.ref> // CHECK: %[[ADDRPTR:.*]] = fir.convert %[[ADDR]] : (!fir.ref>) -> !fir.llvm_ptr // CHECK: %[[DEVICE_ADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[ADDRPTR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir index 1e19b3bea1296..8432b9ec926e3 100644 --- a/flang/test/Fir/CUDA/cuda-launch.fir +++ b/flang/test/Fir/CUDA/cuda-launch.fir @@ -98,9 +98,9 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e } // CHECK-LABEL: func.func @_QQmain() +// CHECK: _FortranACUFSyncGlobalDescriptor // CHECK: %[[ADDROF:.*]] = fir.address_of(@_QMdevptrEdev_ptr) : !fir.ref>>> -// CHECK: %[[DECL:.*]] = fir.declare %[[ADDROF]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMdevptrEdev_ptr"} : (!fir.ref>>>) -> !fir.ref>>> -// 
CHECK: %[[CONV_DECL:.*]] = fir.convert %[[DECL]] : (!fir.ref>>>) -> !fir.llvm_ptr -// CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_DECL]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr +// CHECK: %[[CONV_ADDR:.*]] = fir.convert %[[ADDROF]] : (!fir.ref>>>) -> !fir.llvm_ptr +// CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_ADDR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr // CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr) -> !fir.ref>>> // CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref>>>) diff --git a/flang/test/Lower/Intrinsics/ieee_femodes.f90 b/flang/test/Lower/Intrinsics/ieee_femodes.f90 deleted file mode 100644 index abb264cb027ea..0000000000000 --- a/flang/test/Lower/Intrinsics/ieee_femodes.f90 +++ /dev/null @@ -1,82 +0,0 @@ -! RUN: bbc -emit-fir -o - %s | FileCheck %s - -! CHECK-LABEL: c.func @_QQmain -program m - use ieee_arithmetic - use ieee_exceptions - - ! CHECK: %[[VAL_69:.*]] = fir.alloca !fir.type<_QM__fortran_ieee_exceptionsTieee_modes_type{_QM__fortran_ieee_exceptionsTieee_modes_type.__data:!fir.array<2xi32>}> {bindc_name = "modes", uniq_name = "_QFEmodes"} - ! CHECK: %[[VAL_70:.*]] = fir.declare %[[VAL_69]] {uniq_name = "_QFEmodes"} : (!fir.ref}>>) -> !fir.ref}>> - type(ieee_modes_type) :: modes - - ! CHECK: %[[VAL_71:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}> {bindc_name = "round", uniq_name = "_QFEround"} - ! CHECK: %[[VAL_72:.*]] = fir.declare %[[VAL_71]] {uniq_name = "_QFEround"} : (!fir.ref>) -> !fir.ref> - type(ieee_round_type) :: round - - ! CHECK: %[[VAL_78:.*]] = fir.address_of(@_QQro._QM__fortran_builtinsT__builtin_ieee_round_type.0) : !fir.ref> - ! CHECK: %[[VAL_79:.*]] = fir.declare %[[VAL_78]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro._QM__fortran_builtinsT__builtin_ieee_round_type.0"} : (!fir.ref>) -> !fir.ref> - - ! CHECK: %[[VAL_80:.*]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_round_type.mode, !fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}> - ! CHECK: %[[VAL_81:.*]] = fir.coordinate_of %[[VAL_79]], %[[VAL_80]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[VAL_82:.*]] = fir.load %[[VAL_81]] : !fir.ref - ! CHECK: %[[VAL_83:.*]] = fir.convert %[[VAL_82]] : (i8) -> i32 - ! CHECK: fir.call @llvm.set.rounding(%[[VAL_83]]) fastmath : (i32) -> () - call ieee_set_rounding_mode(ieee_up) - - ! CHECK: %[[VAL_84:.*]] = fir.coordinate_of %[[VAL_72]], %[[VAL_80]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[VAL_85:.*]] = fir.call @llvm.get.rounding() fastmath : () -> i32 - ! CHECK: %[[VAL_86:.*]] = fir.convert %[[VAL_85]] : (i32) -> i8 - ! CHECK: fir.store %[[VAL_86]] to %[[VAL_84]] : !fir.ref - call ieee_get_rounding_mode(round) - - print*, 'rounding_mode [up ] : ', mode_name(round) - - ! CHECK: %[[VAL_103:.*]] = fir.convert %[[VAL_70]] : (!fir.ref}>>) -> !fir.ref - ! CHECK: %[[VAL_104:.*]] = fir.call @fegetmode(%[[VAL_103]]) fastmath : (!fir.ref) -> i32 - call ieee_get_modes(modes) - - ! CHECK: %[[VAL_105:.*]] = fir.address_of(@_QQro._QM__fortran_builtinsT__builtin_ieee_round_type.1) : !fir.ref> - ! 
CHECK: %[[VAL_106:.*]] = fir.declare %[[VAL_105]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro._QM__fortran_builtinsT__builtin_ieee_round_type.1"} : (!fir.ref>) -> !fir.ref> - ! CHECK: %[[VAL_107:.*]] = fir.coordinate_of %[[VAL_106]], %[[VAL_80]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[VAL_108:.*]] = fir.load %[[VAL_107]] : !fir.ref - ! CHECK: %[[VAL_109:.*]] = fir.convert %[[VAL_108]] : (i8) -> i32 - ! CHECK: fir.call @llvm.set.rounding(%[[VAL_109]]) fastmath : (i32) -> () - call ieee_set_rounding_mode(ieee_to_zero) - - ! CHECK: %[[VAL_110:.*]] = fir.call @llvm.get.rounding() fastmath : () -> i32 - ! CHECK: %[[VAL_111:.*]] = fir.convert %[[VAL_110]] : (i32) -> i8 - ! CHECK: fir.store %[[VAL_111]] to %[[VAL_84]] : !fir.ref - call ieee_get_rounding_mode(round) - - print*, 'rounding_mode [to_zero] : ', mode_name(round) - - ! CHECK: %[[VAL_126:.*]] = fir.call @fesetmode(%[[VAL_103]]) fastmath : (!fir.ref) -> i32 - call ieee_set_modes(modes) - - ! CHECK: %[[VAL_127:.*]] = fir.call @llvm.get.rounding() fastmath : () -> i32 - ! CHECK: %[[VAL_128:.*]] = fir.convert %[[VAL_127]] : (i32) -> i8 - ! CHECK: fir.store %[[VAL_128]] to %[[VAL_84]] : !fir.ref - call ieee_get_rounding_mode(round) - - print*, 'rounding_mode [up ] : ', mode_name(round) - -contains - character(7) function mode_name(m) - type(ieee_round_type), intent(in) :: m - if (m == ieee_nearest) then - mode_name = 'nearest' - else if (m == ieee_to_zero) then - mode_name = 'to_zero' - else if (m == ieee_up) then - mode_name = 'up' - else if (m == ieee_down) then - mode_name = 'down' - else if (m == ieee_away) then - mode_name = 'away' - else if (m == ieee_other) then - mode_name = 'other' - else - mode_name = '???' - endif - end -end diff --git a/flang/test/Lower/Intrinsics/ieee_festatus.f90 b/flang/test/Lower/Intrinsics/ieee_festatus.f90 deleted file mode 100644 index 66b1472101ef7..0000000000000 --- a/flang/test/Lower/Intrinsics/ieee_festatus.f90 +++ /dev/null @@ -1,120 +0,0 @@ -! RUN: bbc -emit-fir -o - %s | FileCheck %s - -! CHECK-LABEL: c.func @_QQmain -program s - use ieee_arithmetic - - ! CHECK: %[[V_0:[0-9]+]] = fir.address_of(@_QM__fortran_ieee_exceptionsECieee_all) : !fir.ref>> - ! CHECK: %[[V_1:[0-9]+]] = fir.shape %c5{{.*}} : (index) -> !fir.shape<1> - ! CHECK: %[[V_2:[0-9]+]] = fir.declare %[[V_0]](%[[V_1]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QM__fortran_ieee_exceptionsECieee_all"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - ! CHECK: %[[V_53:[0-9]+]] = fir.address_of(@_QM__fortran_ieee_exceptionsECieee_usual) : !fir.ref>> - ! CHECK: %[[V_54:[0-9]+]] = fir.shape %c3{{.*}} : (index) -> !fir.shape<1> - ! CHECK: %[[V_55:[0-9]+]] = fir.declare %[[V_53]](%[[V_54]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QM__fortran_ieee_exceptionsECieee_usual"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - use ieee_exceptions - - ! CHECK: %[[V_56:[0-9]+]] = fir.alloca !fir.type<_QM__fortran_ieee_exceptionsTieee_status_type{_QM__fortran_ieee_exceptionsTieee_status_type.__data:!fir.array<8xi32>}> {bindc_name = "status", uniq_name = "_QFEstatus"} - ! CHECK: %[[V_57:[0-9]+]] = fir.declare %[[V_56]] {uniq_name = "_QFEstatus"} : (!fir.ref}>>) -> !fir.ref}>> - type(ieee_status_type) :: status - - ! CHECK: %[[V_58:[0-9]+]] = fir.alloca !fir.array<5x!fir.logical<4>> {bindc_name = "v", uniq_name = "_QFEv"} - ! CHECK: %[[V_59:[0-9]+]] = fir.declare %[[V_58]](%[[V_1]]) {uniq_name = "_QFEv"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - logical :: v(size(ieee_all)) - - ! 
CHECK: %[[V_60:[0-9]+]] = fir.address_of(@_QQro.5x_QM__fortran_builtinsT__builtin_ieee_flag_type.0) : !fir.ref>> - ! CHECK: %[[V_61:[0-9]+]] = fir.declare %[[V_60]](%[[V_1]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro.5x_QM__fortran_builtinsT__builtin_ieee_flag_type.0"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - ! CHECK: fir.do_loop %arg0 = %c1{{.*}} to %c5{{.*}} step %c1{{.*}} { - ! CHECK: %[[V_95:[0-9]+]] = fir.array_coor %[[V_61]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_96:[0-9]+]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_flag_type.flag, !fir.type<_QM__fortran_builtinsT__builtin_ieee_flag_type{_QM__fortran_builtinsT__builtin_ieee_flag_type.flag:i8}> - ! CHECK: %[[V_97:[0-9]+]] = fir.coordinate_of %[[V_95]], %[[V_96]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[V_98:[0-9]+]] = fir.load %[[V_97]] : !fir.ref - ! CHECK: %[[V_99:[0-9]+]] = fir.convert %[[V_98]] : (i8) -> i32 - ! CHECK: %[[V_100:[0-9]+]] = fir.call @_FortranAMapException(%[[V_99]]) fastmath : (i32) -> i32 - ! CHECK: fir.if %true{{[_0-9]*}} { - ! CHECK: %[[V_101:[0-9]+]] = fir.call @feenableexcept(%[[V_100]]) fastmath : (i32) -> i32 - ! CHECK: } else { - ! CHECK: %[[V_101:[0-9]+]] = fir.call @fedisableexcept(%[[V_100]]) fastmath : (i32) -> i32 - ! CHECK: } - ! CHECK: } - call ieee_set_halting_mode(ieee_all, .true.) - - ! CHECK: %[[V_62:[0-9]+]] = fir.declare %[[V_60]](%[[V_1]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro.5x_QM__fortran_builtinsT__builtin_ieee_flag_type.0"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - ! CHECK: fir.do_loop %arg0 = %c1{{.*}} to %c5{{.*}} step %c1{{.*}} { - ! CHECK: %[[V_95:[0-9]+]] = fir.array_coor %[[V_62]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_96:[0-9]+]] = fir.array_coor %[[V_59]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_97:[0-9]+]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_flag_type.flag, !fir.type<_QM__fortran_builtinsT__builtin_ieee_flag_type{_QM__fortran_builtinsT__builtin_ieee_flag_type.flag:i8}> - ! CHECK: %[[V_98:[0-9]+]] = fir.coordinate_of %[[V_95]], %[[V_97]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[V_99:[0-9]+]] = fir.load %[[V_98]] : !fir.ref - ! CHECK: %[[V_100:[0-9]+]] = fir.call @fegetexcept() fastmath : () -> i32 - ! CHECK: %[[V_101:[0-9]+]] = fir.convert %[[V_99]] : (i8) -> i32 - ! CHECK: %[[V_102:[0-9]+]] = fir.call @_FortranAMapException(%[[V_101]]) fastmath : (i32) -> i32 - ! CHECK: %[[V_103:[0-9]+]] = arith.andi %[[V_100]], %[[V_102]] : i32 - ! CHECK: %[[V_104:[0-9]+]] = arith.cmpi ne, %[[V_103]], %c0{{.*}} : i32 - ! CHECK: %[[V_105:[0-9]+]] = fir.convert %[[V_104]] : (i1) -> !fir.logical<4> - ! CHECK: fir.store %[[V_105]] to %[[V_96]] : !fir.ref> - ! CHECK: } - call ieee_get_halting_mode(ieee_all, v) - - print*, 'halting_mode [T T T T T] :', v - - ! CHECK: %[[V_75:[0-9]+]] = fir.convert %[[V_57]] : (!fir.ref}>>) -> !fir.ref - ! CHECK: %[[V_76:[0-9]+]] = fir.call @fegetenv(%[[V_75]]) fastmath : (!fir.ref) -> i32 - call ieee_get_status(status) - - ! CHECK: %[[V_77:[0-9]+]] = fir.address_of(@_QQro.3x_QM__fortran_builtinsT__builtin_ieee_flag_type.1) : !fir.ref>> - ! CHECK: %[[V_78:[0-9]+]] = fir.declare %[[V_77]](%[[V_54]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro.3x_QM__fortran_builtinsT__builtin_ieee_flag_type.1"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - ! CHECK: fir.do_loop %arg0 = %c1{{.*}} to %c3{{.*}} step %c1{{.*}} { - ! 
CHECK: %[[V_95:[0-9]+]] = fir.array_coor %[[V_78]](%[[V_54]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_96:[0-9]+]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_flag_type.flag, !fir.type<_QM__fortran_builtinsT__builtin_ieee_flag_type{_QM__fortran_builtinsT__builtin_ieee_flag_type.flag:i8}> - ! CHECK: %[[V_97:[0-9]+]] = fir.coordinate_of %[[V_95]], %[[V_96]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[V_98:[0-9]+]] = fir.load %[[V_97]] : !fir.ref - ! CHECK: %[[V_99:[0-9]+]] = fir.convert %[[V_98]] : (i8) -> i32 - ! CHECK: %[[V_100:[0-9]+]] = fir.call @_FortranAMapException(%[[V_99]]) fastmath : (i32) -> i32 - ! CHECK: fir.if %false{{[_0-9]*}} { - ! CHECK: %[[V_101:[0-9]+]] = fir.call @feenableexcept(%[[V_100]]) fastmath : (i32) -> i32 - ! CHECK: } else { - ! CHECK: %[[V_101:[0-9]+]] = fir.call @fedisableexcept(%[[V_100]]) fastmath : (i32) -> i32 - ! CHECK: } - ! CHECK: } - call ieee_set_halting_mode(ieee_usual, .false.) - - ! CHECK: %[[V_79:[0-9]+]] = fir.declare %[[V_60]](%[[V_1]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro.5x_QM__fortran_builtinsT__builtin_ieee_flag_type.0"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - ! CHECK: fir.do_loop %arg0 = %c1{{.*}} to %c5{{.*}} step %c1{{.*}} { - ! CHECK: %[[V_95:[0-9]+]] = fir.array_coor %[[V_79]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_96:[0-9]+]] = fir.array_coor %[[V_59]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_97:[0-9]+]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_flag_type.flag, !fir.type<_QM__fortran_builtinsT__builtin_ieee_flag_type{_QM__fortran_builtinsT__builtin_ieee_flag_type.flag:i8}> - ! CHECK: %[[V_98:[0-9]+]] = fir.coordinate_of %[[V_95]], %[[V_97]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[V_99:[0-9]+]] = fir.load %[[V_98]] : !fir.ref - ! CHECK: %[[V_100:[0-9]+]] = fir.call @fegetexcept() fastmath : () -> i32 - ! CHECK: %[[V_101:[0-9]+]] = fir.convert %[[V_99]] : (i8) -> i32 - ! CHECK: %[[V_102:[0-9]+]] = fir.call @_FortranAMapException(%[[V_101]]) fastmath : (i32) -> i32 - ! CHECK: %[[V_103:[0-9]+]] = arith.andi %[[V_100]], %[[V_102]] : i32 - ! CHECK: %[[V_104:[0-9]+]] = arith.cmpi ne, %[[V_103]], %c0{{.*}} : i32 - ! CHECK: %[[V_105:[0-9]+]] = fir.convert %[[V_104]] : (i1) -> !fir.logical<4> - ! CHECK: fir.store %[[V_105]] to %[[V_96]] : !fir.ref> - ! CHECK: } - call ieee_get_halting_mode(ieee_all, v) - - print*, 'halting_mode [F F F T T] :', v - - ! CHECK: %[[V_87:[0-9]+]] = fir.call @fesetenv(%[[V_75]]) fastmath : (!fir.ref) -> i32 - ! CHECK: %[[V_88:[0-9]+]] = fir.declare %[[V_60]](%[[V_1]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QQro.5x_QM__fortran_builtinsT__builtin_ieee_flag_type.0"} : (!fir.ref>>, !fir.shape<1>) -> !fir.ref>> - call ieee_set_status(status) - - ! CHECK: fir.do_loop %arg0 = %c1{{.*}} to %c5{{.*}} step %c1{{.*}} { - ! CHECK: %[[V_95:[0-9]+]] = fir.array_coor %[[V_88]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_96:[0-9]+]] = fir.array_coor %[[V_59]](%[[V_1]]) %arg0 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.ref> - ! CHECK: %[[V_97:[0-9]+]] = fir.field_index _QM__fortran_builtinsT__builtin_ieee_flag_type.flag, !fir.type<_QM__fortran_builtinsT__builtin_ieee_flag_type{_QM__fortran_builtinsT__builtin_ieee_flag_type.flag:i8}> - ! CHECK: %[[V_98:[0-9]+]] = fir.coordinate_of %[[V_95]], %[[V_97]] : (!fir.ref>, !fir.field) -> !fir.ref - ! CHECK: %[[V_99:[0-9]+]] = fir.load %[[V_98]] : !fir.ref - ! 
CHECK: %[[V_100:[0-9]+]] = fir.call @fegetexcept() fastmath : () -> i32 - ! CHECK: %[[V_101:[0-9]+]] = fir.convert %[[V_99]] : (i8) -> i32 - ! CHECK: %[[V_102:[0-9]+]] = fir.call @_FortranAMapException(%[[V_101]]) fastmath : (i32) -> i32 - ! CHECK: %[[V_103:[0-9]+]] = arith.andi %[[V_100]], %[[V_102]] : i32 - ! CHECK: %[[V_104:[0-9]+]] = arith.cmpi ne, %[[V_103]], %c0{{.*}} : i32 - ! CHECK: %[[V_105:[0-9]+]] = fir.convert %[[V_104]] : (i1) -> !fir.logical<4> - ! CHECK: fir.store %[[V_105]] to %[[V_96]] : !fir.ref> - ! CHECK: } - call ieee_get_halting_mode(ieee_all, v) - - print*, 'halting_mode [T T T T T] :', v -end diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index fc2b0e91c1286..f5ba341411768 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -322,6 +322,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.fsync libc.src.unistd.ftruncate libc.src.unistd.getcwd + libc.src.unistd.getentropy libc.src.unistd.geteuid libc.src.unistd.getpid libc.src.unistd.getppid diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 723853b2230ae..0c1ae9561a7e6 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -321,6 +321,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.fsync libc.src.unistd.ftruncate libc.src.unistd.getcwd + libc.src.unistd.getentropy libc.src.unistd.geteuid libc.src.unistd.getpid libc.src.unistd.getppid diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt index 4ecc3ada9c768..aad320995d339 100644 --- a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -101,6 +101,9 @@ set(TARGET_LIBC_ENTRYPOINTS # time.h entrypoints libc.src.time.time libc.src.time.clock_getres + + # unistd.h entrypoints + libc.src.unistd.getentropy ) set(TARGET_LIBM_ENTRYPOINTS diff --git a/libc/config/windows/headers.txt b/libc/config/windows/headers.txt index bccc04f7697e5..6d9aae9276924 100644 --- a/libc/config/windows/headers.txt +++ b/libc/config/windows/headers.txt @@ -6,4 +6,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.errno libc.include.fenv libc.include.math + libc.include.unistd ) diff --git a/libc/include/sys/random.yaml b/libc/include/sys/random.yaml index 4efb2fbb44733..a97266a5481df 100644 --- a/libc/include/sys/random.yaml +++ b/libc/include/sys/random.yaml @@ -15,3 +15,10 @@ functions: - type: void * - type: size_t - type: unsigned int + - name: getentropy + standards: + - GNUExtensions + return_type: int + arguments: + - type: void * + - type: size_t diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml index fada365e0103d..c1901be446fe5 100644 --- a/libc/include/unistd.yaml +++ b/libc/include/unistd.yaml @@ -128,6 +128,13 @@ functions: arguments: - type: char * - type: size_t + - name: getentropy + standards: + - GNUExtensions + return_type: int + arguments: + - type: void * + - type: size_t - name: geteuid standards: - POSIX diff --git a/libc/src/CMakeLists.txt b/libc/src/CMakeLists.txt index 32308ba147940..41183429f67a7 100644 --- a/libc/src/CMakeLists.txt +++ b/libc/src/CMakeLists.txt @@ -15,6 +15,7 @@ add_subdirectory(string) add_subdirectory(strings) add_subdirectory(wchar) add_subdirectory(time) +add_subdirectory(unistd) if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(dirent) @@ -23,7 +24,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(sched) add_subdirectory(sys) add_subdirectory(termios) - 
add_subdirectory(unistd) endif() if(NOT LLVM_LIBC_FULL_BUILD) diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt index 1a0b2e3293d03..6bdea0c7693bd 100644 --- a/libc/src/unistd/CMakeLists.txt +++ b/libc/src/unistd/CMakeLists.txt @@ -350,3 +350,10 @@ add_entrypoint_object( DEPENDS libc.src.__support.threads.identifier ) + +add_entrypoint_object( + getentropy + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.getentropy +) diff --git a/libc/src/unistd/getentropy.h b/libc/src/unistd/getentropy.h new file mode 100644 index 0000000000000..27e13d2352d81 --- /dev/null +++ b/libc/src/unistd/getentropy.h @@ -0,0 +1,19 @@ +//===-- Implementation header for getentropy ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/types/size_t.h" +#include "src/__support/common.h" + +#ifndef LLVM_LIBC_SRC_UNISTD_GETENTROPY_H +#define LLVM_LIBC_SRC_UNISTD_GETENTROPY_H + +namespace LIBC_NAMESPACE_DECL { +int getentropy(void *buffer, size_t length); +} + +#endif // LLVM_LIBC_SRC_UNISTD_GETENTROPY_H diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index ed360c73354ac..2bb17f56f7b32 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -570,3 +570,18 @@ add_entrypoint_object( libc.src.__support.OSUtil.osutil libc.src.errno.errno ) + +add_entrypoint_object( + getentropy + SRCS + getentropy.cpp + HDRS + ../getentropy.h + DEPENDS + libc.hdr.types.size_t + libc.hdr.types.ssize_t + libc.hdr.errno_macros + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) diff --git a/libc/src/unistd/linux/getentropy.cpp b/libc/src/unistd/linux/getentropy.cpp new file mode 100644 index 0000000000000..168a1197734ed --- /dev/null +++ b/libc/src/unistd/linux/getentropy.cpp @@ -0,0 +1,51 @@ +//===-- Linux implementation of getentropy --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/getentropy.h" +#include "hdr/errno_macros.h" +#include "src/__support/OSUtil/syscall.h" +#include "src/__support/common.h" +#include "src/errno/libc_errno.h" + +#include // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { +LLVM_LIBC_FUNCTION(int, getentropy, (void *buffer, size_t length)) { + // check the length limit + if (length > 256) { + libc_errno = EIO; + return -1; + } + + char *cursor = static_cast(buffer); + while (length != 0) { + // 0 flag means urandom and blocking, which meets the assumption of + // getentropy + auto ret = syscall_impl(SYS_getrandom, cursor, length, 0); + + // on success, advance the buffer pointer + if (ret >= 0) { + length -= static_cast(ret); + cursor += ret; + continue; + } + + auto error = -static_cast(ret); + + // on EINTR, try again + if (error == EINTR) + continue; + + // on ENOSYS, forward errno and exit; + // otherwise, set EIO and exit + libc_errno = (error == ENOSYS) ? 
ENOSYS : EIO; + return -1; + } + return 0; +} +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/windows/CMakeLists.txt b/libc/src/unistd/windows/CMakeLists.txt new file mode 100644 index 0000000000000..195d98cdb51d4 --- /dev/null +++ b/libc/src/unistd/windows/CMakeLists.txt @@ -0,0 +1,11 @@ +add_entrypoint_object( + getentropy + SRCS + getentropy.cpp + HDRS + ../getentropy.h + DEPENDS + libc.hdr.types.size_t + libc.hdr.errno_macros + libc.src.errno.errno +) diff --git a/libc/src/unistd/windows/getentropy.cpp b/libc/src/unistd/windows/getentropy.cpp new file mode 100644 index 0000000000000..bfaec723ac63d --- /dev/null +++ b/libc/src/unistd/windows/getentropy.cpp @@ -0,0 +1,42 @@ +//===-- Windows implementation of getentropy ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/getentropy.h" +#include "hdr/errno_macros.h" +#include "src/__support/common.h" +#include "src/errno/libc_errno.h" + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <bcrypt.h> +#include <ntstatus.h> +#pragma comment(lib, "bcrypt.lib") + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, getentropy, (void *buffer, size_t length)) { + __try { + // check the length limit + if (length > 256) + __leave; + + NTSTATUS result = ::BCryptGenRandom(nullptr, static_cast<PUCHAR>(buffer), + static_cast<ULONG>(length), + BCRYPT_USE_SYSTEM_PREFERRED_RNG); + + if (result == STATUS_SUCCESS) + return 0; + + } __except (EXCEPTION_EXECUTE_HANDLER) { + // no need to handle exceptions specially + } + + libc_errno = EIO; + return -1; +} +} // namespace LIBC_NAMESPACE_DECL
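Taken together, the Linux and Windows implementations above give callers the usual getentropy contract: requests of at most 256 bytes either fill the buffer completely or fail with errno set (EIO both for oversized requests and for backend failures), and EINTR is retried internally so no short reads leak out. A minimal caller-side sketch, assuming a hosted toolchain whose unistd.h exposes the function:

    #include <unistd.h> // getentropy
    #include <cerrno>
    #include <cstdio>

    int main() {
      unsigned char seed[32];
      // Either fills all 32 bytes or fails; lengths above 256 fail with EIO.
      if (::getentropy(seed, sizeof seed) != 0) {
        std::perror("getentropy");
        return 1;
      }
      std::printf("seeded, first byte %u\n", seed[0]);
      return 0;
    }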
diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index 31008508d6492..22ec43588f744 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -61,6 +61,7 @@ add_subdirectory(string) add_subdirectory(strings) add_subdirectory(wchar) add_subdirectory(time) +add_subdirectory(unistd) # Depends on utilities in stdlib add_subdirectory(inttypes) @@ -70,7 +71,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(sched) add_subdirectory(sys) add_subdirectory(termios) - add_subdirectory(unistd) endif() if(NOT LLVM_LIBC_FULL_BUILD) diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index e036e09cde702..c3eebdf2a877d 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -488,6 +488,18 @@ add_libc_test( libc.src.stdio.fflush ) +add_libc_test( + getentropy_test + SUITE + libc_unistd_unittests + SRCS + getentropy_test.cpp + DEPENDS + libc.src.unistd.getentropy + libc.src.errno.errno + libc.test.UnitTest.ErrnoSetterMatcher +) + if(LLVM_LIBC_FULL_BUILD) add_libc_test( _exit_test diff --git a/libc/test/src/unistd/getentropy_test.cpp b/libc/test/src/unistd/getentropy_test.cpp new file mode 100644 index 0000000000000..f7329ae419327 --- /dev/null +++ b/libc/test/src/unistd/getentropy_test.cpp @@ -0,0 +1,28 @@ +//===-- Unittests for getentropy ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/errno_macros.h" +#include "src/unistd/getentropy.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; + +TEST(LlvmLibcUnistdGetEntropyTest, LengthTooLong) { + char buf[1024]; + ASSERT_THAT(LIBC_NAMESPACE::getentropy(buf, 257), Fails(EIO)); +} + +TEST(LlvmLibcUnistdGetEntropyTest, SmokeTest) { + char buf[256]; + ASSERT_THAT(LIBC_NAMESPACE::getentropy(buf, 256), Succeeds()); +} + +TEST(LlvmLibcUnistdGetEntropyTest, OtherError) { + ASSERT_THAT(LIBC_NAMESPACE::getentropy(nullptr, 1), Fails(EIO)); +} diff --git a/libclc/generic/include/clc/geometric/floatn.inc b/libclc/clc/include/clc/geometric/floatn.inc similarity index 96% rename from libclc/generic/include/clc/geometric/floatn.inc rename to libclc/clc/include/clc/geometric/floatn.inc index 49c797f9f1845..919c2cadbff4f 100644 --- a/libclc/generic/include/clc/geometric/floatn.inc +++ b/libclc/clc/include/clc/geometric/floatn.inc @@ -1,3 +1,6 @@ +#include +#include + #define __CLC_FLOAT float #define __CLC_FPSIZE 32 diff --git a/libclc/clc/include/clc/integer/gentype.inc b/libclc/clc/include/clc/integer/gentype.inc index 2c8dd143db879..98682a6d32c70 100644 --- a/libclc/clc/include/clc/integer/gentype.inc +++ b/libclc/clc/include/clc/integer/gentype.inc @@ -1,3 +1,6 @@ +#include +#include + // These 2 defines only change when switching between data sizes or base types // to keep this file manageable. #define __CLC_GENSIZE 8 diff --git a/libclc/clc/include/clc/math/gentype.inc b/libclc/clc/include/clc/math/gentype.inc index 966b4269f66c1..87719f2d9bc0e 100644 --- a/libclc/clc/include/clc/math/gentype.inc +++ b/libclc/clc/include/clc/math/gentype.inc @@ -1,3 +1,6 @@ +#include +#include + #define __CLC_SCALAR_GENTYPE float #define __CLC_FPSIZE 32 diff --git a/libclc/clc/include/clc/math/unary_intrin.inc b/libclc/clc/include/clc/math/unary_intrin.inc index c331d3ff08a61..5ea2246244bef 100644 --- a/libclc/clc/include/clc/math/unary_intrin.inc +++ b/libclc/clc/include/clc/math/unary_intrin.inc @@ -1,3 +1,6 @@ +#include +#include + _CLC_OVERLOAD float __CLC_FUNCTION(float f) __asm(__CLC_INTRINSIC ".f32"); _CLC_OVERLOAD float2 __CLC_FUNCTION(float2 f) __asm(__CLC_INTRINSIC ".v2f32"); _CLC_OVERLOAD float3 __CLC_FUNCTION(float3 f) __asm(__CLC_INTRINSIC ".v3f32"); diff --git a/libclc/clc/include/clc/relational/clc_all.h b/libclc/clc/include/clc/relational/clc_all.h index bf068105aa1be..7be3d132dd53d 100644 --- a/libclc/clc/include/clc/relational/clc_all.h +++ b/libclc/clc/include/clc/relational/clc_all.h @@ -7,6 +7,7 @@ #else #include +#include #define _CLC_ALL_DECL(TYPE) _CLC_OVERLOAD _CLC_DECL int __clc_all(TYPE v); diff --git a/libclc/clc/include/clc/relational/clc_any.h b/libclc/clc/include/clc/relational/clc_any.h index f947b77e08341..27dbffeb2eecd 100644 --- a/libclc/clc/include/clc/relational/clc_any.h +++ b/libclc/clc/include/clc/relational/clc_any.h @@ -7,6 +7,7 @@ #else #include +#include #define _CLC_ANY_DECL(TYPE) _CLC_OVERLOAD _CLC_DECL int __clc_any(TYPE v); diff --git a/libclc/clc/include/clc/relational/clc_isequal.h b/libclc/clc/include/clc/relational/clc_isequal.h index 3a36ea24fd299..0f31fb9530a14 100644 --- a/libclc/clc/include/clc/relational/clc_isequal.h +++ b/libclc/clc/include/clc/relational/clc_isequal.h @@ -7,6 +7,7 @@ #else #include +#include #define _CLC_ISEQUAL_DECL(TYPE,
RETTYPE) \ _CLC_OVERLOAD _CLC_DECL RETTYPE __clc_isequal(TYPE x, TYPE y); diff --git a/libclc/clc/include/clc/relational/clc_isinf.h b/libclc/clc/include/clc/relational/clc_isinf.h index c33ef9bb9527d..3f60bec5654a2 100644 --- a/libclc/clc/include/clc/relational/clc_isinf.h +++ b/libclc/clc/include/clc/relational/clc_isinf.h @@ -7,6 +7,7 @@ #else #include +#include #define _CLC_ISINF_DECL(RET_TYPE, ARG_TYPE) \ _CLC_OVERLOAD _CLC_DECL RET_TYPE __clc_isinf(ARG_TYPE); diff --git a/libclc/clc/include/clc/relational/clc_isnan.h b/libclc/clc/include/clc/relational/clc_isnan.h index 08351eb5515f9..3200e593c5cff 100644 --- a/libclc/clc/include/clc/relational/clc_isnan.h +++ b/libclc/clc/include/clc/relational/clc_isnan.h @@ -7,6 +7,7 @@ #else #include +#include #define _CLC_ISNAN_DECL(RET_TYPE, ARG_TYPE) \ _CLC_OVERLOAD _CLC_DECL RET_TYPE __clc_isnan(ARG_TYPE); diff --git a/libclc/clc/include/clc/relational/floatn.inc b/libclc/clc/include/clc/relational/floatn.inc index fc0d6878b4aa7..18fb20bd9effe 100644 --- a/libclc/clc/include/clc/relational/floatn.inc +++ b/libclc/clc/include/clc/relational/floatn.inc @@ -1,3 +1,5 @@ +#include +#include #define __CLC_FLOATN float #define __CLC_INTN int diff --git a/libclc/clc/include/clc/shared/clc_clamp.h b/libclc/clc/include/clc/shared/clc_clamp.h index a84184c1750a5..d9d39413c5618 100644 --- a/libclc/clc/include/clc/shared/clc_clamp.h +++ b/libclc/clc/include/clc/shared/clc_clamp.h @@ -6,9 +6,6 @@ #define __clc_clamp clamp #else -#include -#include - #define __CLC_BODY #include diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 0d89457046a50..6af6b4f730766 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -37,7 +37,6 @@ #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/GlobPattern.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Parallel.h" @@ -172,23 +171,10 @@ static std::future createFutureForFile(std::string path) { }); } -// Symbol names are mangled by prepending "_" on x86. 
-StringRef LinkerDriver::mangle(StringRef sym) { - assert(ctx.config.machine != IMAGE_FILE_MACHINE_UNKNOWN); - if (ctx.config.machine == I386) - return saver().save("_" + sym); - return sym; -} - llvm::Triple::ArchType LinkerDriver::getArch() { return getMachineArchType(ctx.config.machine); } -bool LinkerDriver::findUnderscoreMangle(StringRef sym) { - Symbol *s = ctx.symtab.findMangle(mangle(sym)); - return s && !isa(s); -} - static bool compatibleMachineType(COFFLinkerContext &ctx, MachineTypes mt) { if (mt == IMAGE_FILE_MACHINE_UNKNOWN) return true; @@ -486,7 +472,7 @@ void LinkerDriver::parseDirectives(InputFile *file) { SmallVector vec; e.split(vec, ','); for (StringRef sym : vec) - excludedSymbols.insert(mangle(sym)); + excludedSymbols.insert(file->symtab.mangle(sym)); } // https://docs.microsoft.com/en-us/cpp/preprocessor/comment-c-cpp?view=msvc-160 @@ -505,7 +491,8 @@ void LinkerDriver::parseDirectives(InputFile *file) { case OPT_entry: if (!arg->getValue()[0]) Fatal(ctx) << "missing entry point symbol name"; - ctx.config.entry = file->symtab.addGCRoot(mangle(arg->getValue()), true); + ctx.config.entry = + file->symtab.addGCRoot(file->symtab.mangle(arg->getValue()), true); break; case OPT_failifmismatch: checkFailIfMismatch(arg->getValue(), file); @@ -805,97 +792,6 @@ void LinkerDriver::addLibSearchPaths() { } } -void LinkerDriver::addUndefinedGlob(StringRef arg) { - Expected pat = GlobPattern::create(arg); - if (!pat) { - Err(ctx) << "/includeglob: " << toString(pat.takeError()); - return; - } - - SmallVector syms; - ctx.symtab.forEachSymbol([&syms, &pat](Symbol *sym) { - if (pat->match(sym->getName())) { - syms.push_back(sym); - } - }); - - for (Symbol *sym : syms) - ctx.symtab.addGCRoot(sym->getName()); -} - -StringRef LinkerDriver::mangleMaybe(Symbol *s) { - // If the plain symbol name has already been resolved, do nothing. - Undefined *unmangled = dyn_cast(s); - if (!unmangled) - return ""; - - // Otherwise, see if a similar, mangled symbol exists in the symbol table. - Symbol *mangled = ctx.symtab.findMangle(unmangled->getName()); - if (!mangled) - return ""; - - // If we find a similar mangled symbol, make this an alias to it and return - // its name. - Log(ctx) << unmangled->getName() << " aliased to " << mangled->getName(); - unmangled->setWeakAlias(ctx.symtab.addUndefined(mangled->getName())); - return mangled->getName(); -} - -// Windows specific -- find default entry point name. -// -// There are four different entry point functions for Windows executables, -// each of which corresponds to a user-defined "main" function. This function -// infers an entry point from a user-defined "main" function. -StringRef LinkerDriver::findDefaultEntry() { - assert(ctx.config.subsystem != IMAGE_SUBSYSTEM_UNKNOWN && - "must handle /subsystem before calling this"); - - if (ctx.config.mingw) - return mangle(ctx.config.subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI - ? 
"WinMainCRTStartup" - : "mainCRTStartup"); - - if (ctx.config.subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI) { - if (findUnderscoreMangle("wWinMain")) { - if (!findUnderscoreMangle("WinMain")) - return mangle("wWinMainCRTStartup"); - Warn(ctx) << "found both wWinMain and WinMain; using latter"; - } - return mangle("WinMainCRTStartup"); - } - if (findUnderscoreMangle("wmain")) { - if (!findUnderscoreMangle("main")) - return mangle("wmainCRTStartup"); - Warn(ctx) << "found both wmain and main; using latter"; - } - return mangle("mainCRTStartup"); -} - -WindowsSubsystem LinkerDriver::inferSubsystem() { - if (ctx.config.dll) - return IMAGE_SUBSYSTEM_WINDOWS_GUI; - if (ctx.config.mingw) - return IMAGE_SUBSYSTEM_WINDOWS_CUI; - // Note that link.exe infers the subsystem from the presence of these - // functions even if /entry: or /nodefaultlib are passed which causes them - // to not be called. - bool haveMain = findUnderscoreMangle("main"); - bool haveWMain = findUnderscoreMangle("wmain"); - bool haveWinMain = findUnderscoreMangle("WinMain"); - bool haveWWinMain = findUnderscoreMangle("wWinMain"); - if (haveMain || haveWMain) { - if (haveWinMain || haveWWinMain) { - Warn(ctx) << "found " << (haveMain ? "main" : "wmain") << " and " - << (haveWinMain ? "WinMain" : "wWinMain") - << "; defaulting to /subsystem:console"; - } - return IMAGE_SUBSYSTEM_WINDOWS_CUI; - } - if (haveWinMain || haveWWinMain) - return IMAGE_SUBSYSTEM_WINDOWS_GUI; - return IMAGE_SUBSYSTEM_UNKNOWN; -} - uint64_t LinkerDriver::getDefaultImageBase() { if (ctx.config.is64()) return ctx.config.dll ? 0x180000000 : 0x140000000; @@ -1539,7 +1435,7 @@ void LinkerDriver::maybeExportMinGWSymbols(const opt::InputArgList &args) { SmallVector vec; StringRef(arg->getValue()).split(vec, ','); for (StringRef sym : vec) - exporter.addExcludedSymbol(mangle(sym)); + exporter.addExcludedSymbol(ctx.symtab.mangle(sym)); } ctx.symtab.forEachSymbol([&](Symbol *s) { @@ -2455,7 +2351,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // and after the early return when just writing an import library. if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN) { llvm::TimeTraceScope timeScope("Infer subsystem"); - config->subsystem = inferSubsystem(); + config->subsystem = ctx.symtab.inferSubsystem(); if (config->subsystem == IMAGE_SUBSYSTEM_UNKNOWN) Fatal(ctx) << "subsystem must be defined"; } @@ -2466,7 +2362,8 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (auto *arg = args.getLastArg(OPT_entry)) { if (!arg->getValue()[0]) Fatal(ctx) << "missing entry point symbol name"; - config->entry = ctx.symtab.addGCRoot(mangle(arg->getValue()), true); + config->entry = + ctx.symtab.addGCRoot(ctx.symtab.mangle(arg->getValue()), true); } else if (!config->entry && !config->noEntry) { if (args.hasArg(OPT_dll)) { StringRef s = (config->machine == I386) ? "__DllMainCRTStartup@12" @@ -2474,11 +2371,12 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { config->entry = ctx.symtab.addGCRoot(s, true); } else if (config->driverWdm) { // /driver:wdm implies /entry:_NtProcessStartup - config->entry = ctx.symtab.addGCRoot(mangle("_NtProcessStartup"), true); + config->entry = + ctx.symtab.addGCRoot(ctx.symtab.mangle("_NtProcessStartup"), true); } else { // Windows specific -- If entry point name is not given, we need to // infer that from user-defined entry name. 
- StringRef s = findDefaultEntry(); + StringRef s = ctx.symtab.findDefaultEntry(); if (s.empty()) Fatal(ctx) << "entry point must be defined"; config->entry = ctx.symtab.addGCRoot(s, true); @@ -2568,24 +2466,24 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { config->imageBase = getDefaultImageBase(); ctx.forEachSymtab([&](SymbolTable &symtab) { - symtab.addSynthetic(mangle("__ImageBase"), nullptr); + symtab.addSynthetic(symtab.mangle("__ImageBase"), nullptr); if (symtab.machine == I386) { symtab.addAbsolute("___safe_se_handler_table", 0); symtab.addAbsolute("___safe_se_handler_count", 0); } - symtab.addAbsolute(mangle("__guard_fids_count"), 0); - symtab.addAbsolute(mangle("__guard_fids_table"), 0); - symtab.addAbsolute(mangle("__guard_flags"), 0); - symtab.addAbsolute(mangle("__guard_iat_count"), 0); - symtab.addAbsolute(mangle("__guard_iat_table"), 0); - symtab.addAbsolute(mangle("__guard_longjmp_count"), 0); - symtab.addAbsolute(mangle("__guard_longjmp_table"), 0); + symtab.addAbsolute(symtab.mangle("__guard_fids_count"), 0); + symtab.addAbsolute(symtab.mangle("__guard_fids_table"), 0); + symtab.addAbsolute(symtab.mangle("__guard_flags"), 0); + symtab.addAbsolute(symtab.mangle("__guard_iat_count"), 0); + symtab.addAbsolute(symtab.mangle("__guard_iat_table"), 0); + symtab.addAbsolute(symtab.mangle("__guard_longjmp_count"), 0); + symtab.addAbsolute(symtab.mangle("__guard_longjmp_table"), 0); // Needed for MSVC 2017 15.5 CRT. - symtab.addAbsolute(mangle("__enclave_config"), 0); + symtab.addAbsolute(symtab.mangle("__enclave_config"), 0); // Needed for MSVC 2019 16.8 CRT. - symtab.addAbsolute(mangle("__guard_eh_cont_count"), 0); - symtab.addAbsolute(mangle("__guard_eh_cont_table"), 0); + symtab.addAbsolute(symtab.mangle("__guard_eh_cont_count"), 0); + symtab.addAbsolute(symtab.mangle("__guard_eh_cont_table"), 0); if (symtab.isEC()) { symtab.addAbsolute("__arm64x_extra_rfe_table", 0); @@ -2606,16 +2504,16 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } if (config->pseudoRelocs) { - symtab.addAbsolute(mangle("__RUNTIME_PSEUDO_RELOC_LIST__"), 0); - symtab.addAbsolute(mangle("__RUNTIME_PSEUDO_RELOC_LIST_END__"), 0); + symtab.addAbsolute(symtab.mangle("__RUNTIME_PSEUDO_RELOC_LIST__"), 0); + symtab.addAbsolute(symtab.mangle("__RUNTIME_PSEUDO_RELOC_LIST_END__"), 0); } if (config->mingw) { - symtab.addAbsolute(mangle("__CTOR_LIST__"), 0); - symtab.addAbsolute(mangle("__DTOR_LIST__"), 0); + symtab.addAbsolute(symtab.mangle("__CTOR_LIST__"), 0); + symtab.addAbsolute(symtab.mangle("__DTOR_LIST__"), 0); } if (config->debug || config->buildIDHash != BuildIDHash::None) if (symtab.findUnderscore("__buildid")) - symtab.addUndefined(mangle("__buildid")); + symtab.addUndefined(symtab.mangle("__buildid")); }); // This code may add new undefined symbols to the link, which may enqueue more @@ -2627,7 +2525,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // Windows specific -- if entry point is not found, // search for its mangled names. if (config->entry) - mangleMaybe(config->entry); + ctx.symtab.mangleMaybe(config->entry); // Windows specific -- Make sure we resolve all dllexported symbols. for (Export &e : config->exports) { @@ -2635,7 +2533,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { continue; e.sym = ctx.symtab.addGCRoot(e.name, !e.data); if (e.source != ExportSource::Directives) - e.symbolName = mangleMaybe(e.sym); + e.symbolName = ctx.symtab.mangleMaybe(e.sym); } // Add weak aliases. 
Weak aliasing is a mechanism to give remaining @@ -2675,7 +2573,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { // Windows specific -- if __load_config_used can be resolved, resolve it. if (ctx.symtab.findUnderscore("_load_config_used")) - ctx.symtab.addGCRoot(mangle("_load_config_used")); + ctx.symtab.addGCRoot(ctx.symtab.mangle("_load_config_used")); if (args.hasArg(OPT_include_optional)) { // Handle /includeoptional @@ -2688,7 +2586,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { // Handle /includeglob for (StringRef pat : args::getStrings(args, OPT_incl_glob)) - addUndefinedGlob(pat); + ctx.symtab.addUndefinedGlob(pat); // Create wrapped symbols for -wrap option. std::vector<Symbol *> wrapped = addWrappedSymbols(ctx, args); diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h index 9d4f1cbfcb584..4558f68c041fa 100644 --- a/lld/COFF/Driver.h +++ b/lld/COFF/Driver.h @@ -106,8 +106,6 @@ class LinkerDriver { StringRef findLib(StringRef filename); StringRef findLibMinGW(StringRef filename); - bool findUnderscoreMangle(StringRef sym); - // Determines the location of the sysroot based on `args`, environment, etc. void detectWinSysRoot(const llvm::opt::InputArgList &args); @@ -115,9 +113,6 @@ class LinkerDriver { // config.machine has been set. void addWinSysRootLibSearchPaths(); - // Symbol names are mangled by prepending "_" on x86. - StringRef mangle(StringRef sym); - void setMachine(llvm::COFF::MachineTypes machine); llvm::Triple::ArchType getArch(); @@ -173,20 +168,6 @@ class LinkerDriver { std::set<std::string> visitedLibs; - void addUndefinedGlob(StringRef arg); - - StringRef mangleMaybe(Symbol *s); - - // Windows specific -- "main" is not the only main function in Windows. - // You can choose one from these four -- {w,}{WinMain,main}. - // There are four different entry point functions for them, - // {w,}{WinMain,main}CRTStartup, respectively. The linker needs to - // choose the right one depending on which "main" function is defined. - // This function looks up the symbol table and resolve corresponding - // entry point name. - StringRef findDefaultEntry(); - WindowsSubsystem inferSubsystem(); - void addBuffer(std::unique_ptr<MemoryBuffer> mb, bool wholeArchive, bool lazy); void addArchiveBuffer(MemoryBufferRef mbref, StringRef symName, diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index b2f3ffe780e5d..7c43ada3d136e 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -21,12 +21,14 @@ #include "llvm/IR/Mangler.h" #include "llvm/LTO/LTO.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/GlobPattern.h" #include "llvm/Support/Parallel.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/raw_ostream.h" #include using namespace llvm; +using namespace llvm::COFF; using namespace llvm::support; namespace lld::coff { @@ -1022,6 +1024,110 @@ Symbol *SymbolTable::findMangle(StringRef name) { return findByPrefix("?" + name.substr(1) + "@@Y"); } +bool SymbolTable::findUnderscoreMangle(StringRef sym) { + Symbol *s = findMangle(mangle(sym)); + return s && !isa<Undefined>(s); +} + +// Symbol names are mangled by prepending "_" on x86. +StringRef SymbolTable::mangle(StringRef sym) { + assert(machine != IMAGE_FILE_MACHINE_UNKNOWN); + if (machine == I386) + return saver().save("_" + sym); + return sym; +} + +StringRef SymbolTable::mangleMaybe(Symbol *s) { + // If the plain symbol name has already been resolved, do nothing. + Undefined *unmangled = dyn_cast<Undefined>(s); + if (!unmangled) + return ""; + + // Otherwise, see if a similar, mangled symbol exists in the symbol table.
+ Symbol *mangled = findMangle(unmangled->getName()); + if (!mangled) + return ""; + + // If we find a similar mangled symbol, make this an alias to it and return + // its name. + Log(ctx) << unmangled->getName() << " aliased to " << mangled->getName(); + unmangled->setWeakAlias(addUndefined(mangled->getName())); + return mangled->getName(); +} + +// Windows specific -- find default entry point name. +// +// There are four different entry point functions for Windows executables, +// each of which corresponds to a user-defined "main" function. This function +// infers an entry point from a user-defined "main" function. +StringRef SymbolTable::findDefaultEntry() { + assert(ctx.config.subsystem != IMAGE_SUBSYSTEM_UNKNOWN && + "must handle /subsystem before calling this"); + + if (ctx.config.mingw) + return mangle(ctx.config.subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI + ? "WinMainCRTStartup" + : "mainCRTStartup"); + + if (ctx.config.subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI) { + if (findUnderscoreMangle("wWinMain")) { + if (!findUnderscoreMangle("WinMain")) + return mangle("wWinMainCRTStartup"); + Warn(ctx) << "found both wWinMain and WinMain; using latter"; + } + return mangle("WinMainCRTStartup"); + } + if (findUnderscoreMangle("wmain")) { + if (!findUnderscoreMangle("main")) + return mangle("wmainCRTStartup"); + Warn(ctx) << "found both wmain and main; using latter"; + } + return mangle("mainCRTStartup"); +} + +WindowsSubsystem SymbolTable::inferSubsystem() { + if (ctx.config.dll) + return IMAGE_SUBSYSTEM_WINDOWS_GUI; + if (ctx.config.mingw) + return IMAGE_SUBSYSTEM_WINDOWS_CUI; + // Note that link.exe infers the subsystem from the presence of these + // functions even if /entry: or /nodefaultlib are passed which causes them + // to not be called. + bool haveMain = findUnderscoreMangle("main"); + bool haveWMain = findUnderscoreMangle("wmain"); + bool haveWinMain = findUnderscoreMangle("WinMain"); + bool haveWWinMain = findUnderscoreMangle("wWinMain"); + if (haveMain || haveWMain) { + if (haveWinMain || haveWWinMain) { + Warn(ctx) << "found " << (haveMain ? "main" : "wmain") << " and " + << (haveWinMain ? "WinMain" : "wWinMain") + << "; defaulting to /subsystem:console"; + } + return IMAGE_SUBSYSTEM_WINDOWS_CUI; + } + if (haveWinMain || haveWWinMain) + return IMAGE_SUBSYSTEM_WINDOWS_GUI; + return IMAGE_SUBSYSTEM_UNKNOWN; +} + +void SymbolTable::addUndefinedGlob(StringRef arg) { + Expected pat = GlobPattern::create(arg); + if (!pat) { + Err(ctx) << "/includeglob: " << toString(pat.takeError()); + return; + } + + SmallVector syms; + forEachSymbol([&syms, &pat](Symbol *sym) { + if (pat->match(sym->getName())) { + syms.push_back(sym); + } + }); + + for (Symbol *sym : syms) + addGCRoot(sym->getName()); +} + Symbol *SymbolTable::addUndefined(StringRef name) { return addUndefined(name, nullptr, false); } diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index 4c749ae059d27..1de0b3e1deac3 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -74,11 +74,27 @@ class SymbolTable { Symbol *find(StringRef name) const; Symbol *findUnderscore(StringRef name) const; + void addUndefinedGlob(StringRef arg); + // Occasionally we have to resolve an undefined symbol to its // mangled symbol. This function tries to find a mangled name // for U from the symbol table, and if found, set the symbol as // a weak alias for U. Symbol *findMangle(StringRef name); + StringRef mangleMaybe(Symbol *s); + + // Symbol names are mangled by prepending "_" on x86. 
+ StringRef mangle(StringRef sym); + + // Windows specific -- "main" is not the only main function in Windows. + // You can choose one from these four -- {w,}{WinMain,main}. + // There are four different entry point functions for them, + // {w,}{WinMain,main}CRTStartup, respectively. The linker needs to + // choose the right one depending on which "main" function is defined. + // This function looks up the symbol table and resolves the corresponding + // entry point name. + StringRef findDefaultEntry(); + WindowsSubsystem inferSubsystem(); // Build a set of COFF objects representing the combined contents of // BitcodeFiles and add them to the symbol table. Called after all files are @@ -152,6 +168,7 @@ class SymbolTable { /// Same as insert(Name), but also sets isUsedInRegularObj. std::pair<Symbol *, bool> insert(StringRef name, InputFile *f); + bool findUnderscoreMangle(StringRef sym); std::vector<Symbol *> getSymsWithPrefix(StringRef prefix); llvm::DenseMap<llvm::CachedHashStringRef, Symbol *> symMap; diff --git a/lldb/include/lldb/Core/Disassembler.h b/lldb/include/lldb/Core/Disassembler.h index e0ad4316e0249..21bacb14f9b25 100644 --- a/lldb/include/lldb/Core/Disassembler.h +++ b/lldb/include/lldb/Core/Disassembler.h @@ -428,7 +428,7 @@ class Disassembler : public std::enable_shared_from_this<Disassembler>, static lldb::DisassemblerSP DisassembleRange(const ArchSpec &arch, const char *plugin_name, const char *flavor, const char *cpu, const char *features, - Target &target, const AddressRange &disasm_range, + Target &target, llvm::ArrayRef<AddressRange> disasm_ranges, bool force_live_memory = false); static lldb::DisassemblerSP @@ -460,7 +460,11 @@ class Disassembler : public std::enable_shared_from_this<Disassembler>, size_t ParseInstructions(Target &target, Address address, Limit limit, Stream *error_strm_ptr, - bool force_live_memory = false); + bool force_live_memory = false) { + m_instruction_list.Clear(); + return AppendInstructions(target, address, limit, error_strm_ptr, + force_live_memory); + } virtual size_t DecodeInstructions(const Address &base_addr, const DataExtractor &data, @@ -480,6 +484,9 @@ class Disassembler : public std::enable_shared_from_this<Disassembler>, const char *flavor) = 0; protected: + size_t AppendInstructions(Target &target, Address address, Limit limit, + Stream *error_strm_ptr, bool force_live_memory); + // SourceLine and SourceLinesToDisplay structures are only used in the mixed // source and assembly display methods internal to this class.
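Stepping back to the lld hunks above: the change is a mechanical move of mangle, mangleMaybe, findUnderscoreMangle, findDefaultEntry, inferSubsystem, and addUndefinedGlob from LinkerDriver into SymbolTable, so each query consults the symbol table it is actually about (which matters once EC/ARM64X links carry more than one symbol table, as the forEachSymtab loop earlier suggests). Restated as a standalone sketch, the subsystem inference reduces to a small decision table (names here are illustrative, not the lld API):

    enum class Subsystem { Console, Gui, Unknown };

    // Mirrors SymbolTable::inferSubsystem: fixed defaults for DLL and MinGW
    // links; otherwise console-style entry points win over GUI-style ones.
    Subsystem infer(bool isDll, bool isMinGW, bool haveMain, bool haveWMain,
                    bool haveWinMain, bool haveWWinMain) {
      if (isDll)
        return Subsystem::Gui;
      if (isMinGW)
        return Subsystem::Console;
      if (haveMain || haveWMain)
        return Subsystem::Console; // warns if a GUI-style entry also exists
      if (haveWinMain || haveWWinMain)
        return Subsystem::Gui;
      return Subsystem::Unknown;
    }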
diff --git a/lldb/include/lldb/Target/SectionLoadHistory.h b/lldb/include/lldb/Target/SectionLoadHistory.h index 64bb828d4254a..4380d6f2cf121 100644 --- a/lldb/include/lldb/Target/SectionLoadHistory.h +++ b/lldb/include/lldb/Target/SectionLoadHistory.h @@ -45,7 +45,7 @@ class SectionLoadHistory { const lldb::SectionSP §ion_sp); bool ResolveLoadAddress(uint32_t stop_id, lldb::addr_t load_addr, - Address &so_addr); + Address &so_addr, bool allow_section_end = false); bool SetSectionLoadAddress(uint32_t stop_id, const lldb::SectionSP §ion_sp, diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index 0d1943450d622..f31ac381391b4 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -1151,9 +1151,13 @@ class Target : public std::enable_shared_from_this, Address &pointer_addr, bool force_live_memory = false); - SectionLoadList &GetSectionLoadList() { - return m_section_load_history.GetCurrentSectionLoadList(); - } + bool HasLoadedSections(); + + lldb::addr_t GetSectionLoadAddress(const lldb::SectionSP §ion_sp); + + void ClearSectionLoadList(); + + void DumpSectionLoadList(Stream &s); static Target *GetTargetFromContexts(const ExecutionContext *exe_ctx_ptr, const SymbolContext *sc_ptr); @@ -1218,7 +1222,8 @@ class Target : public std::enable_shared_from_this, bool ResolveFileAddress(lldb::addr_t load_addr, Address &so_addr); bool ResolveLoadAddress(lldb::addr_t load_addr, Address &so_addr, - uint32_t stop_id = SectionLoadHistory::eStopIDNow); + uint32_t stop_id = SectionLoadHistory::eStopIDNow, + bool allow_section_end = false); bool SetSectionLoadAddress(const lldb::SectionSP §ion, lldb::addr_t load_addr, @@ -1666,6 +1671,10 @@ class Target : public std::enable_shared_from_this, Target(const Target &) = delete; const Target &operator=(const Target &) = delete; + + SectionLoadList &GetSectionLoadList() { + return m_section_load_history.GetCurrentSectionLoadList(); + } }; } // namespace lldb_private diff --git a/lldb/source/API/SBBreakpoint.cpp b/lldb/source/API/SBBreakpoint.cpp index b2ed034d19983..87fadbcec4f26 100644 --- a/lldb/source/API/SBBreakpoint.cpp +++ b/lldb/source/API/SBBreakpoint.cpp @@ -137,7 +137,7 @@ SBBreakpointLocation SBBreakpoint::FindLocationByAddress(addr_t vm_addr) { bkpt_sp->GetTarget().GetAPIMutex()); Address address; Target &target = bkpt_sp->GetTarget(); - if (!target.GetSectionLoadList().ResolveLoadAddress(vm_addr, address)) { + if (!target.ResolveLoadAddress(vm_addr, address)) { address.SetRawAddress(vm_addr); } sb_bp_location.SetLocation(bkpt_sp->FindLocationByAddress(address)); @@ -157,7 +157,7 @@ break_id_t SBBreakpoint::FindLocationIDByAddress(addr_t vm_addr) { bkpt_sp->GetTarget().GetAPIMutex()); Address address; Target &target = bkpt_sp->GetTarget(); - if (!target.GetSectionLoadList().ResolveLoadAddress(vm_addr, address)) { + if (!target.ResolveLoadAddress(vm_addr, address)) { address.SetRawAddress(vm_addr); } break_id = bkpt_sp->FindLocationIDByAddress(address); diff --git a/lldb/source/API/SBFunction.cpp b/lldb/source/API/SBFunction.cpp index 414eccc357c0e..d07594c2e8c01 100644 --- a/lldb/source/API/SBFunction.cpp +++ b/lldb/source/API/SBFunction.cpp @@ -127,7 +127,7 @@ SBInstructionList SBFunction::GetInstructions(SBTarget target, sb_instructions.SetDisassembler(Disassembler::DisassembleRange( module_sp->GetArchitecture(), nullptr, flavor, target_sp->GetDisassemblyCPU(), target_sp->GetDisassemblyFeatures(), - *target_sp, m_opaque_ptr->GetAddressRange(), force_live_memory)); + *target_sp, 
m_opaque_ptr->GetAddressRanges(), force_live_memory)); } } return sb_instructions; }
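The SBFunction hunk just above switches GetInstructions to hand the disassembler every address range of the function, which matters for functions whose code is non-contiguous (for example after hot/cold splitting); per the Disassembler.h hunk earlier, DisassembleRange now takes llvm::ArrayRef<AddressRange> and ParseInstructions is a thin Clear-then-AppendInstructions wrapper, so ranges accumulate into one list. The SBInstructionList hunk that follows adds the matching presentation rule: track where the next instruction should begin and print a blank line when the actual address disagrees. A self-contained sketch of that idiom (Inst and dump are hypothetical stand-ins, not the lldb API):

    #include <cstdint>
    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    struct Inst { std::uint64_t addr; std::uint64_t size; std::string text; };

    // Print instructions, inserting a blank line between non-contiguous blocks.
    void dump(const std::vector<Inst> &instructions) {
      std::optional<std::uint64_t> expected;
      for (const auto &inst : instructions) {
        if (expected && *expected != inst.addr)
          std::cout << '\n'; // gap: a new non-contiguous block starts here
        std::cout << inst.text << '\n';
        expected = inst.addr + inst.size; // where a contiguous successor begins
      }
    }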
diff --git a/lldb/source/API/SBInstructionList.cpp b/lldb/source/API/SBInstructionList.cpp index 3f37b984cb462..c18204375dff1 100644 --- a/lldb/source/API/SBInstructionList.cpp +++ b/lldb/source/API/SBInstructionList.cpp @@ -151,6 +151,10 @@ bool SBInstructionList::GetDescription(Stream &sref) { FormatEntity::Parse("${addr}: ", format); SymbolContext sc; SymbolContext prev_sc; + + // Expected address of the next instruction. Used to print an empty line + // for non-contiguous blocks of insns. + std::optional<addr_t> next_addr; for (size_t i = 0; i < num_instructions; ++i) { Instruction *inst = m_opaque_sp->GetInstructionList().GetInstructionAtIndex(i).get(); @@ -165,10 +169,14 @@ bool SBInstructionList::GetDescription(Stream &sref) { addr, eSymbolContextEverything, sc); } + if (next_addr && *next_addr != addr) + sref.EOL(); inst->Dump(&sref, max_opcode_byte_size, true, false, /*show_control_flow_kind=*/false, nullptr, &sc, &prev_sc, &format, 0); sref.EOL(); + next_addr = addr; + next_addr->Slide(inst->GetOpcode().GetByteSize()); } return true; } diff --git a/lldb/source/Breakpoint/BreakpointLocationList.cpp b/lldb/source/Breakpoint/BreakpointLocationList.cpp index e0f1b9b2c8088..0240305d6f292 100644 --- a/lldb/source/Breakpoint/BreakpointLocationList.cpp +++ b/lldb/source/Breakpoint/BreakpointLocationList.cpp @@ -103,8 +103,7 @@ BreakpointLocationList::FindByAddress(const Address &addr) const { so_addr = addr; } else { // Try and resolve as a load address if possible. - m_owner.GetTarget().GetSectionLoadList().ResolveLoadAddress( - addr.GetOffset(), so_addr); + m_owner.GetTarget().ResolveLoadAddress(addr.GetOffset(), so_addr); if (!so_addr.IsValid()) { // The address didn't resolve, so just set to passed in addr. so_addr = addr; diff --git a/lldb/source/Commands/CommandObjectDisassemble.cpp b/lldb/source/Commands/CommandObjectDisassemble.cpp index 6db4b2665bd84..5b131fe86dedb 100644 --- a/lldb/source/Commands/CommandObjectDisassemble.cpp +++ b/lldb/source/Commands/CommandObjectDisassemble.cpp @@ -269,10 +269,10 @@ CommandObjectDisassemble::GetContainingAddressRanges() { }; Target &target = GetTarget(); - if (!target.GetSectionLoadList().IsEmpty()) { + if (target.HasLoadedSections()) { Address symbol_containing_address; - if (target.GetSectionLoadList().ResolveLoadAddress( - m_options.symbol_containing_addr, symbol_containing_address)) { + if (target.ResolveLoadAddress(m_options.symbol_containing_addr, + symbol_containing_address)) { get_range(symbol_containing_address); } } else { diff --git a/lldb/source/Commands/CommandObjectRegister.cpp b/lldb/source/Commands/CommandObjectRegister.cpp index 4e047ccbc10b9..fbb92e5c63877 100644 --- a/lldb/source/Commands/CommandObjectRegister.cpp +++ b/lldb/source/Commands/CommandObjectRegister.cpp @@ -95,8 +95,8 @@ class CommandObjectRegisterRead : public CommandObjectParsed { addr_t reg_addr = reg_value.GetAsUInt64(LLDB_INVALID_ADDRESS); if (reg_addr != LLDB_INVALID_ADDRESS) { Address so_reg_addr; - if (exe_ctx.GetTargetRef().GetSectionLoadList().ResolveLoadAddress( - reg_addr, so_reg_addr)) { + if (exe_ctx.GetTargetRef().ResolveLoadAddress(reg_addr, + so_reg_addr)) { strm.PutCString(" "); so_reg_addr.Dump(&strm, exe_ctx.GetBestExecutionContextScope(), Address::DumpStyleResolvedDescription); diff --git a/lldb/source/Commands/CommandObjectSource.cpp b/lldb/source/Commands/CommandObjectSource.cpp index c8295fd10cf22..936783216f6ff 100644 --- a/lldb/source/Commands/CommandObjectSource.cpp +++ b/lldb/source/Commands/CommandObjectSource.cpp @@ -302,7 +302,7 @@ class CommandObjectSourceInfo : public CommandObjectParsed { size_t num_matches = 0; assert(module_list.GetSize() > 0); Target &target = GetTarget(); - if (target.GetSectionLoadList().IsEmpty()) { + if (!target.HasLoadedSections()) { // The target isn't loaded yet, we need to lookup the file address in all // modules. Note: the module list option does not apply to addresses.
const size_t num_modules = module_list.GetSize(); @@ -328,7 +328,7 @@ class CommandObjectSourceInfo : public CommandObjectParsed { } else { // The target has some things loaded, resolve this address to a compile // unit + file + line and display - if (target.GetSectionLoadList().ResolveLoadAddress(addr, so_addr)) { + if (target.ResolveLoadAddress(addr, so_addr)) { ModuleSP module_sp(so_addr.GetModule()); // Check to make sure this module is in our list. if (module_sp && module_list.GetIndexForModule(module_sp.get()) != @@ -959,7 +959,7 @@ class CommandObjectSourceList : public CommandObjectParsed { StreamString error_strm; SymbolContextList sc_list; - if (target.GetSectionLoadList().IsEmpty()) { + if (!target.HasLoadedSections()) { // The target isn't loaded yet, we need to lookup the file address in // all modules const ModuleList &module_list = target.GetImages(); @@ -987,8 +987,7 @@ class CommandObjectSourceList : public CommandObjectParsed { } else { // The target has some things loaded, resolve this address to a compile // unit + file + line and display - if (target.GetSectionLoadList().ResolveLoadAddress(m_options.address, - so_addr)) { + if (target.ResolveLoadAddress(m_options.address, so_addr)) { ModuleSP module_sp(so_addr.GetModule()); if (module_sp) { SymbolContext sc; diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 307f4f683e3b2..d8265e41a7384 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -1522,8 +1522,8 @@ static bool LookupAddressInModule(CommandInterpreter &interpreter, Stream &strm, Address so_addr; SymbolContext sc; Target *target = interpreter.GetExecutionContext().GetTargetPtr(); - if (target && !target->GetSectionLoadList().IsEmpty()) { - if (!target->GetSectionLoadList().ResolveLoadAddress(addr, so_addr)) + if (target && target->HasLoadedSections()) { + if (!target->ResolveLoadAddress(addr, so_addr)) return false; else if (so_addr.GetModule().get() != module) return false; @@ -2974,8 +2974,8 @@ class CommandObjectTargetModulesLoad sect_name); break; } else { - if (target.GetSectionLoadList().SetSectionLoadAddress( - section_sp, load_addr)) + if (target.SetSectionLoadAddress(section_sp, + load_addr)) changed = true; result.AppendMessageWithFormat( "section '%s' loaded at 0x%" PRIx64 "\n", @@ -3329,7 +3329,7 @@ class CommandObjectTargetModulesList : public CommandObjectParsed { if (objfile) { Address base_addr(objfile->GetBaseAddress()); if (base_addr.IsValid()) { - if (!target.GetSectionLoadList().IsEmpty()) { + if (target.HasLoadedSections()) { lldb::addr_t load_addr = base_addr.GetLoadAddress(&target); if (load_addr == LLDB_INVALID_ADDRESS) { base_addr.Dump(&strm, &target, @@ -3544,8 +3544,7 @@ class CommandObjectTargetModulesShowUnwind : public CommandObjectParsed { function_options, sc_list); } else if (m_options.m_type == eLookupTypeAddress && target) { Address addr; - if (target->GetSectionLoadList().ResolveLoadAddress(m_options.m_addr, - addr)) { + if (target->ResolveLoadAddress(m_options.m_addr, addr)) { SymbolContext sc; ModuleSP module_sp(addr.GetModule()); module_sp->ResolveSymbolContextForAddress(addr, @@ -5270,7 +5269,7 @@ class CommandObjectTargetDumpSectionLoadList : public CommandObjectParsed { protected: void DoExecute(Args &command, CommandReturnObject &result) override { Target &target = GetTarget(); - target.GetSectionLoadList().Dump(result.GetOutputStream(), &target); + target.DumpSectionLoadList(result.GetOutputStream()); 
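A single pattern runs through the lldb hunks above and below: Target::GetSectionLoadList() moves into Target's private section (see the Target.h hunk earlier), and every call site switches to an intention-revealing wrapper such as HasLoadedSections, ResolveLoadAddress, GetSectionLoadAddress, SetSectionLoadAddress, SetSectionUnloaded, ClearSectionLoadList, or DumpSectionLoadList. The wrapper bodies live in Target.cpp and are not part of this excerpt; a plausible sketch, assuming they simply forward to the now-private accessor:

    // Assumed forwarding implementations (not shown in the patch); signatures
    // follow the Target.h hunk above.
    bool Target::HasLoadedSections() { return !GetSectionLoadList().IsEmpty(); }

    lldb::addr_t Target::GetSectionLoadAddress(const lldb::SectionSP &section_sp) {
      return GetSectionLoadList().GetSectionLoadAddress(section_sp);
    }

    void Target::DumpSectionLoadList(Stream &s) {
      GetSectionLoadList().Dump(s, this);
    }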
result.SetStatus(eReturnStatusSuccessFinishResult); } }; diff --git a/lldb/source/Core/Address.cpp b/lldb/source/Core/Address.cpp index 5a4751bd5256e..1dab874a96583 100644 --- a/lldb/source/Core/Address.cpp +++ b/lldb/source/Core/Address.cpp @@ -138,9 +138,8 @@ static bool ReadAddress(ExecutionContextScope *exe_scope, // If we have any sections that are loaded, try and resolve using the // section load list Target *target = exe_ctx.GetTargetPtr(); - if (target && !target->GetSectionLoadList().IsEmpty()) { - if (target->GetSectionLoadList().ResolveLoadAddress(deref_addr, - deref_so_addr)) + if (target && target->HasLoadedSections()) { + if (target->ResolveLoadAddress(deref_addr, deref_so_addr)) return true; } else { // If we were not running, yet able to read an integer, we must have a @@ -1046,8 +1045,9 @@ AddressClass Address::GetAddressClass() const { bool Address::SetLoadAddress(lldb::addr_t load_addr, Target *target, bool allow_section_end) { - if (target && target->GetSectionLoadList().ResolveLoadAddress( - load_addr, *this, allow_section_end)) + if (target && target->ResolveLoadAddress(load_addr, *this, + SectionLoadHistory::eStopIDNow, + allow_section_end)) return true; m_section_wp.reset(); m_offset = load_addr; diff --git a/lldb/source/Core/Disassembler.cpp b/lldb/source/Core/Disassembler.cpp index 68e52144eb6ef..b05be7e1a46d7 100644 --- a/lldb/source/Core/Disassembler.cpp +++ b/lldb/source/Core/Disassembler.cpp @@ -107,11 +107,11 @@ static Address ResolveAddress(Target &target, const Address &addr) { Address resolved_addr; // If we weren't passed in a section offset address range, try and resolve // it to something - bool is_resolved = target.GetSectionLoadList().IsEmpty() - ? target.GetImages().ResolveFileAddress( - addr.GetOffset(), resolved_addr) - : target.GetSectionLoadList().ResolveLoadAddress( - addr.GetOffset(), resolved_addr); + bool is_resolved = + target.HasLoadedSections() + ? 
target.ResolveLoadAddress(addr.GetOffset(), resolved_addr) + : target.GetImages().ResolveFileAddress(addr.GetOffset(), + resolved_addr); // We weren't able to resolve the address, just treat it as a raw address if (is_resolved && resolved_addr.IsValid()) @@ -123,22 +123,19 @@ static Address ResolveAddress(Target &target, const Address &addr) { lldb::DisassemblerSP Disassembler::DisassembleRange( const ArchSpec &arch, const char *plugin_name, const char *flavor, const char *cpu, const char *features, Target &target, - const AddressRange &range, bool force_live_memory) { - if (range.GetByteSize() <= 0) - return {}; - - if (!range.GetBaseAddress().IsValid()) - return {}; - + llvm::ArrayRef disasm_ranges, bool force_live_memory) { lldb::DisassemblerSP disasm_sp = Disassembler::FindPluginForTarget( target, arch, flavor, cpu, features, plugin_name); if (!disasm_sp) return {}; - const size_t bytes_disassembled = disasm_sp->ParseInstructions( - target, range.GetBaseAddress(), {Limit::Bytes, range.GetByteSize()}, - nullptr, force_live_memory); + size_t bytes_disassembled = 0; + for (const AddressRange &range : disasm_ranges) { + bytes_disassembled += disasm_sp->AppendInstructions( + target, range.GetBaseAddress(), {Limit::Bytes, range.GetByteSize()}, + nullptr, force_live_memory); + } if (bytes_disassembled == 0) return {}; @@ -1092,11 +1089,9 @@ InstructionList::GetIndexOfInstructionAtLoadAddress(lldb::addr_t load_addr, return GetIndexOfInstructionAtAddress(address); } -size_t Disassembler::ParseInstructions(Target &target, Address start, - Limit limit, Stream *error_strm_ptr, - bool force_live_memory) { - m_instruction_list.Clear(); - +size_t Disassembler::AppendInstructions(Target &target, Address start, + Limit limit, Stream *error_strm_ptr, + bool force_live_memory) { if (!start.IsValid()) return 0; @@ -1129,7 +1124,7 @@ size_t Disassembler::ParseInstructions(Target &target, Address start, return DecodeInstructions(start, data, 0, limit.kind == Limit::Instructions ? 
limit.value : UINT32_MAX, - false, data_from_file); + /*append=*/true, data_from_file); } // Disassembler copy constructor diff --git a/lldb/source/Core/DumpDataExtractor.cpp b/lldb/source/Core/DumpDataExtractor.cpp index 565ee3a0ae40a..72140736d8877 100644 --- a/lldb/source/Core/DumpDataExtractor.cpp +++ b/lldb/source/Core/DumpDataExtractor.cpp @@ -136,10 +136,10 @@ static lldb::offset_t DumpInstructions(const DataExtractor &DE, Stream *s, lldb::addr_t addr = base_addr + start_offset; lldb_private::Address so_addr; bool data_from_file = true; - if (target_sp->GetSectionLoadList().ResolveLoadAddress(addr, so_addr)) { + if (target_sp->ResolveLoadAddress(addr, so_addr)) { data_from_file = false; } else { - if (target_sp->GetSectionLoadList().IsEmpty() || + if (!target_sp->HasLoadedSections() || !target_sp->GetImages().ResolveFileAddress(addr, so_addr)) so_addr.SetRawAddress(addr); } @@ -707,8 +707,7 @@ lldb::offset_t lldb_private::DumpDataExtractor( TargetSP target_sp(exe_scope->CalculateTarget()); lldb_private::Address so_addr; if (target_sp) { - if (target_sp->GetSectionLoadList().ResolveLoadAddress(addr, - so_addr)) { + if (target_sp->ResolveLoadAddress(addr, so_addr)) { s->PutChar(' '); so_addr.Dump(s, exe_scope, Address::DumpStyleResolvedDescription, Address::DumpStyleModuleWithFileAddress); @@ -719,8 +718,7 @@ lldb::offset_t lldb_private::DumpDataExtractor( if (ProcessSP process_sp = exe_scope->CalculateProcess()) { if (ABISP abi_sp = process_sp->GetABI()) { addr_t addr_fixed = abi_sp->FixCodeAddress(addr); - if (target_sp->GetSectionLoadList().ResolveLoadAddress( - addr_fixed, so_addr)) { + if (target_sp->ResolveLoadAddress(addr_fixed, so_addr)) { s->PutChar(' '); s->Printf("(0x%*.*" PRIx64 ")", (int)(2 * item_byte_size), (int)(2 * item_byte_size), addr_fixed); diff --git a/lldb/source/Core/FormatEntity.cpp b/lldb/source/Core/FormatEntity.cpp index d76fc97caa013..e13284832cf57 100644 --- a/lldb/source/Core/FormatEntity.cpp +++ b/lldb/source/Core/FormatEntity.cpp @@ -412,7 +412,7 @@ static bool DumpAddressAndContent(Stream &s, const SymbolContext *sc, Target *target = Target::GetTargetFromContexts(exe_ctx, sc); addr_t vaddr = LLDB_INVALID_ADDRESS; - if (target && !target->GetSectionLoadList().IsEmpty()) + if (target && target->HasLoadedSections()) vaddr = addr.GetLoadAddress(target); if (vaddr == LLDB_INVALID_ADDRESS) vaddr = addr.GetFileAddress(); diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp index 31273ede618f2..a17f43fe89033 100644 --- a/lldb/source/Core/Section.cpp +++ b/lldb/source/Core/Section.cpp @@ -238,7 +238,7 @@ addr_t Section::GetLoadBaseAddress(Target *target) const { load_base_addr += GetOffset(); } if (load_base_addr == LLDB_INVALID_ADDRESS) { - load_base_addr = target->GetSectionLoadList().GetSectionLoadAddress( + load_base_addr = target->GetSectionLoadAddress( const_cast
(this)->shared_from_this()); } return load_base_addr; @@ -643,8 +643,7 @@ bool SectionList::ContainsSection(user_id_t sect_id) const { void SectionList::Dump(llvm::raw_ostream &s, unsigned indent, Target *target, bool show_header, uint32_t depth) const { - bool target_has_loaded_sections = - target && !target->GetSectionLoadList().IsEmpty(); + bool target_has_loaded_sections = target && target->HasLoadedSections(); if (show_header && !m_sections.empty()) { s.indent(indent); s << llvm::formatv( diff --git a/lldb/source/Core/Value.cpp b/lldb/source/Core/Value.cpp index bd93c04c16d24..70299cb8455a1 100644 --- a/lldb/source/Core/Value.cpp +++ b/lldb/source/Core/Value.cpp @@ -364,10 +364,9 @@ Status Value::GetValueAsData(ExecutionContext *exe_ctx, DataExtractor &data, // memory sections loaded. This allows you to use "target modules // load" to load your executable and any shared libraries, then // execute commands where you can look at types in data sections. - const SectionLoadList &target_sections = target->GetSectionLoadList(); - if (!target_sections.IsEmpty()) { + if (target->HasLoadedSections()) { address = m_value.ULongLong(LLDB_INVALID_ADDRESS); - if (target_sections.ResolveLoadAddress(address, file_so_addr)) { + if (target->ResolveLoadAddress(address, file_so_addr)) { address_type = eAddressTypeLoad; data.SetByteOrder(target->GetArchitecture().GetByteOrder()); data.SetAddressByteSize( diff --git a/lldb/source/DataFormatters/CXXFunctionPointer.cpp b/lldb/source/DataFormatters/CXXFunctionPointer.cpp index 6d56e39fa9733..e17659ad0f2f0 100644 --- a/lldb/source/DataFormatters/CXXFunctionPointer.cpp +++ b/lldb/source/DataFormatters/CXXFunctionPointer.cpp @@ -39,9 +39,8 @@ bool lldb_private::formatters::CXXFunctionPointerSummaryProvider( Address so_addr; Target *target = exe_ctx.GetTargetPtr(); - if (target && !target->GetSectionLoadList().IsEmpty()) { - target->GetSectionLoadList().ResolveLoadAddress(func_ptr_address, - so_addr); + if (target && target->HasLoadedSections()) { + target->ResolveLoadAddress(func_ptr_address, so_addr); if (so_addr.GetSection() == nullptr) { // If we have an address that doesn't correspond to any symbol, // it might have authentication bits. 
Strip them & see if it diff --git a/lldb/source/Expression/ObjectFileJIT.cpp b/lldb/source/Expression/ObjectFileJIT.cpp index 9a839866096bd..e4a613551d22e 100644 --- a/lldb/source/Expression/ObjectFileJIT.cpp +++ b/lldb/source/Expression/ObjectFileJIT.cpp @@ -178,8 +178,8 @@ bool ObjectFileJIT::SetLoadAddress(Target &target, lldb::addr_t value, SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); if (section_sp && section_sp->GetFileSize() > 0 && !section_sp->IsThreadSpecific()) { - if (target.GetSectionLoadList().SetSectionLoadAddress( - section_sp, section_sp->GetFileAddress() + value)) + if (target.SetSectionLoadAddress(section_sp, + section_sp->GetFileAddress() + value)) ++num_loaded_sections; } } diff --git a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp index 5aa903443c760..3748be0533ad7 100644 --- a/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp +++ b/lldb/source/Plugins/Architecture/Mips/ArchitectureMips.cpp @@ -76,8 +76,7 @@ lldb::addr_t ArchitectureMips::GetBreakableLoadAddress(lldb::addr_t addr, Address resolved_addr; - SectionLoadList §ion_load_list = target.GetSectionLoadList(); - if (section_load_list.IsEmpty()) + if (!target.HasLoadedSections()) // No sections are loaded, so we must assume we are not running yet and // need to operate only on file address. target.ResolveFileAddress(addr, resolved_addr); diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp index 81c122146764d..76f2db086476f 100644 --- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp +++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp @@ -1787,9 +1787,9 @@ const char *DisassemblerLLVMC::SymbolLookup(uint64_t value, uint64_t *type_ptr, module_sp->ResolveFileAddress(value, value_so_addr); module_sp->ResolveFileAddress(pc, pc_so_addr); } - } else if (target && !target->GetSectionLoadList().IsEmpty()) { - target->GetSectionLoadList().ResolveLoadAddress(value, value_so_addr); - target->GetSectionLoadList().ResolveLoadAddress(pc, pc_so_addr); + } else if (target && target->HasLoadedSections()) { + target->ResolveLoadAddress(value, value_so_addr); + target->ResolveLoadAddress(pc, pc_so_addr); } SymbolContext sym_ctx; diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp index 82555d1e028b4..5b11059bcc50c 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp @@ -368,7 +368,7 @@ bool DynamicLoaderMacOS::NotifyBreakpointHit(void *baton, dyld_instance->UnloadAllImages(); dyld_instance->ClearDYLDModule(); process->GetTarget().GetImages().Clear(); - process->GetTarget().GetSectionLoadList().Clear(); + process->GetTarget().ClearSectionLoadList(); addr_t all_image_infos = process->GetImageInfoAddress(); int addr_size = diff --git a/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp b/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp index e8b92373ef0fa..643c9653f26ec 100644 --- a/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp +++ b/lldb/source/Plugins/DynamicLoader/Static/DynamicLoaderStatic.cpp @@ -103,8 +103,8 @@ void DynamicLoaderStatic::LoadAllImagesAtFileAddresses() { for (size_t sect_idx = 0; sect_idx < num_sections; ++sect_idx) { SectionSP 
section_sp(section_list->GetSectionAtIndex(sect_idx)); if (section_sp) { - if (target.GetSectionLoadList().GetSectionLoadAddress( - section_sp) != LLDB_INVALID_ADDRESS) { + if (target.GetSectionLoadAddress(section_sp) != + LLDB_INVALID_ADDRESS) { no_load_addresses = false; break; } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h b/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h index d5c68a436e090..7403b79be6cc0 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ASTUtils.h @@ -71,8 +71,9 @@ class ExternalASTSourceWrapper : public clang::ExternalSemaSource { } bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name) override { - return m_Source->FindExternalVisibleDeclsByName(DC, Name); + clang::DeclarationName Name, + clang::Module *NamedModule) override { + return m_Source->FindExternalVisibleDeclsByName(DC, Name, NamedModule); } bool LoadExternalSpecializations(const clang::Decl *D, @@ -388,9 +389,10 @@ class SemaSourceWithPriorities : public clang::ExternalSemaSource { } bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name) override { + clang::DeclarationName Name, + clang::Module *NamedModule) override { for (size_t i = 0; i < Sources.size(); ++i) - if (Sources[i]->FindExternalVisibleDeclsByName(DC, Name)) + if (Sources[i]->FindExternalVisibleDeclsByName(DC, Name, NamedModule)) return true; return false; } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp index e41efdd3f61c7..94ce867ef4a0f 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp @@ -99,7 +99,8 @@ void ClangASTSource::StartTranslationUnit(ASTConsumer *Consumer) { // The core lookup interface. bool ClangASTSource::FindExternalVisibleDeclsByName( - const DeclContext *decl_ctx, DeclarationName clang_decl_name) { + const DeclContext *decl_ctx, DeclarationName clang_decl_name, + clang::Module *NamedModule) { if (!m_ast_context) { SetNoExternalVisibleDeclsForName(decl_ctx, clang_decl_name); return false; diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h index 83c910477acc8..6dc4ecc94e0ed 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h @@ -84,7 +84,8 @@ class ClangASTSource : public clang::ExternalASTSource, /// \return /// Whatever SetExternalVisibleDeclsForName returns. bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name) override; + clang::DeclarationName Name, + clang::Module *NamedModule) override; /// Enumerate all Decls in a given lexical context. 
/// @@ -212,8 +213,9 @@ class ClangASTSource : public clang::ExternalASTSource, ClangASTSourceProxy(ClangASTSource &original) : m_original(original) {} bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name) override { - return m_original.FindExternalVisibleDeclsByName(DC, Name); + clang::DeclarationName Name, + clang::Module *NamedModule) override { + return m_original.FindExternalVisibleDeclsByName(DC, Name, NamedModule); } void FindExternalLexicalDecls( diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp index e746e6afe39be..bf4537e69eb63 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.cpp @@ -50,7 +50,8 @@ void ClangExternalASTSourceCallbacks::FindExternalLexicalDecls( } bool ClangExternalASTSourceCallbacks::FindExternalVisibleDeclsByName( - const clang::DeclContext *DC, clang::DeclarationName Name) { + const clang::DeclContext *DC, clang::DeclarationName Name, + clang::Module *NamedModule) { llvm::SmallVector decls; // Objective-C methods are not added into the LookupPtr when they originate // from an external source. SetExternalVisibleDeclsForName() adds them. diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h index 6bd18186a567d..d2e9c1552fd38 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExternalASTSourceCallbacks.h @@ -38,7 +38,8 @@ class ClangExternalASTSourceCallbacks : public clang::ExternalASTSource { llvm::SmallVectorImpl &Result) override; bool FindExternalVisibleDeclsByName(const clang::DeclContext *DC, - clang::DeclarationName Name) override; + clang::DeclarationName Name, + clang::Module *NamedModule) override; void CompleteType(clang::TagDecl *tag_decl) override; diff --git a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp index 6d3e5b7e5573c..70e36801c3fd7 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp @@ -546,8 +546,7 @@ static std::string Sprintf(const char *format, ...) 
{ static std::string GetSymbolNameFromAddress(ProcessSP process_sp, addr_t addr) { lldb_private::Address so_addr; - if (!process_sp->GetTarget().GetSectionLoadList().ResolveLoadAddress(addr, - so_addr)) + if (!process_sp->GetTarget().ResolveLoadAddress(addr, so_addr)) return ""; lldb_private::Symbol *symbol = so_addr.CalculateSymbolContextSymbol(); @@ -561,8 +560,7 @@ static std::string GetSymbolNameFromAddress(ProcessSP process_sp, addr_t addr) { static void GetSymbolDeclarationFromAddress(ProcessSP process_sp, addr_t addr, Declaration &decl) { lldb_private::Address so_addr; - if (!process_sp->GetTarget().GetSectionLoadList().ResolveLoadAddress(addr, - so_addr)) + if (!process_sp->GetTarget().ResolveLoadAddress(addr, so_addr)) return; lldb_private::Symbol *symbol = so_addr.CalculateSymbolContextSymbol(); @@ -600,8 +598,7 @@ addr_t InstrumentationRuntimeTSan::GetFirstNonInternalFramePc( addr_t addr = *maybe_addr; lldb_private::Address so_addr; - if (!process_sp->GetTarget().GetSectionLoadList().ResolveLoadAddress( - addr, so_addr)) + if (!process_sp->GetTarget().ResolveLoadAddress(addr, so_addr)) continue; if (so_addr.GetModule() == runtime_module_sp) diff --git a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp index 1688fb27430a7..b6487d4e8ed4b 100644 --- a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp +++ b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp @@ -377,7 +377,7 @@ bool JITLoaderGDB::ReadJITDescriptorImpl(bool all_entries) { for (uint32_t i = 0; i < num_sections; ++i) { SectionSP section_sp(section_list->GetSectionAtIndex(i)); if (section_sp) { - target.GetSectionLoadList().SetSectionUnloaded(section_sp); + target.SetSectionUnloaded(section_sp); } } } diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index e7ca3f655f237..fb706544ea560 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -266,21 +266,20 @@ CPPLanguageRuntime::FindLibCppStdFunctionCallableInfo( Target &target = process->GetTarget(); - if (target.GetSectionLoadList().IsEmpty()) + if (!target.HasLoadedSections()) return optional_info; Address vtable_first_entry_resolved; - if (!target.GetSectionLoadList().ResolveLoadAddress( - vtable_address_first_entry, vtable_first_entry_resolved)) + if (!target.ResolveLoadAddress(vtable_address_first_entry, + vtable_first_entry_resolved)) return optional_info; Address vtable_addr_resolved; SymbolContext sc; Symbol *symbol = nullptr; - if (!target.GetSectionLoadList().ResolveLoadAddress(vtable_address, - vtable_addr_resolved)) + if (!target.ResolveLoadAddress(vtable_address, vtable_addr_resolved)) return optional_info; target.GetImages().ResolveSymbolContextForAddress( @@ -322,8 +321,8 @@ CPPLanguageRuntime::FindLibCppStdFunctionCallableInfo( // Setup for cases 2, 4 and 5 we have a pointer to a function after the // vtable. We will use a process of elimination to drop through each case // and obtain the data we need. 
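// [Sketch, not part of the patch] For reference, the call-site shape this
// refactor converges on; `ResolveExample` is a hypothetical helper, and the
// Target wrappers it uses (HasLoadedSections / ResolveLoadAddress) are
// defined later in this patch in lldb/source/Target/Target.cpp.
#include "lldb/Target/Target.h"

void ResolveExample(lldb_private::Target &target, lldb::addr_t load_addr) {
  // Bail out early when nothing has been loaded; resolution cannot succeed.
  if (!target.HasLoadedSections())
    return;
  lldb_private::Address resolved;
  if (target.ResolveLoadAddress(load_addr, resolved)) {
    // `resolved` is now section-relative and usable for symbol lookup, e.g.
    // via resolved.CalculateSymbolContextSymbol().
  }
}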
- if (target.GetSectionLoadList().ResolveLoadAddress( - possible_function_address, function_address_resolved)) { + if (target.ResolveLoadAddress(possible_function_address, + function_address_resolved)) { target.GetImages().ResolveSymbolContextForAddress( function_address_resolved, eSymbolContextEverything, sc); symbol = sc.symbol; @@ -418,15 +417,14 @@ CPPLanguageRuntime::GetStepThroughTrampolinePlan(Thread &thread, TargetSP target_sp(thread.CalculateTarget()); - if (target_sp->GetSectionLoadList().IsEmpty()) + if (!target_sp->HasLoadedSections()) return ret_plan_sp; Address pc_addr_resolved; SymbolContext sc; Symbol *symbol; - if (!target_sp->GetSectionLoadList().ResolveLoadAddress(curr_pc, - pc_addr_resolved)) + if (!target_sp->ResolveLoadAddress(curr_pc, pc_addr_resolved)) return ret_plan_sp; target_sp->GetImages().ResolveSymbolContextForAddress( diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp index 96a259b811b5e..24fc5bb2c047f 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp @@ -30,7 +30,8 @@ class lldb_private::AppleObjCExternalASTSource : m_decl_vendor(decl_vendor) {} bool FindExternalVisibleDeclsByName(const clang::DeclContext *decl_ctx, - clang::DeclarationName name) override { + clang::DeclarationName name, + clang::Module *NamedModule) override { Log *log(GetLog( LLDBLog::Expressions)); // FIXME - a more appropriate log channel? diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 6452baa4f84af..13e1198516f78 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -762,8 +762,7 @@ bool ObjectFileELF::SetLoadAddress(Target &target, lldb::addr_t value, if (GetAddressByteSize() == 4) load_addr &= 0xFFFFFFFF; - if (target.GetSectionLoadList().SetSectionLoadAddress(section_sp, - load_addr)) + if (target.SetSectionLoadAddress(section_sp, load_addr)) ++num_loaded_sections; } } diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 488c9bd1e54af..bf2d293d2012c 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -6253,9 +6253,9 @@ bool ObjectFileMachO::SetLoadAddress(Target &target, lldb::addr_t value, "0x%" PRIx64, section_sp->GetName().AsCString(), section_sp->GetFileAddress() + value); - if (target.GetSectionLoadList().SetSectionLoadAddress( - section_sp, section_sp->GetFileAddress() + value, - warn_multiple)) + if (target.SetSectionLoadAddress(section_sp, + section_sp->GetFileAddress() + value, + warn_multiple)) ++num_loaded_sections; } } @@ -6276,8 +6276,8 @@ bool ObjectFileMachO::SetLoadAddress(Target &target, lldb::addr_t value, "ObjectFileMachO::SetLoadAddress segment '%s' load addr is " "0x%" PRIx64, section_sp->GetName().AsCString(), section_load_addr); - if (target.GetSectionLoadList().SetSectionLoadAddress( - section_sp, section_load_addr, warn_multiple)) + if (target.SetSectionLoadAddress(section_sp, section_load_addr, + warn_multiple)) ++num_loaded_sections; } } diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index 
bfdb8140e40af..6d92a204b86cc 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -482,7 +482,7 @@ bool ObjectFilePECOFF::SetLoadAddress(Target &target, addr_t value, // that have SHF_ALLOC in their flag bits. SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); if (section_sp && !section_sp->IsThreadSpecific()) { - if (target.GetSectionLoadList().SetSectionLoadAddress( + if (target.SetSectionLoadAddress( section_sp, section_sp->GetFileAddress() + value)) ++num_loaded_sections; } diff --git a/lldb/source/Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.cpp b/lldb/source/Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.cpp index ec1f3f61892d3..e8745d6dd6b83 100644 --- a/lldb/source/Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.cpp +++ b/lldb/source/Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.cpp @@ -59,8 +59,7 @@ bool ObjectFilePlaceholder::SetLoadAddress(Target &target, addr_t value, GetModule()->GetSectionList(); assert(m_sections_up->GetNumSections(0) == 1); - target.GetSectionLoadList().SetSectionLoadAddress( - m_sections_up->GetSectionAtIndex(0), m_base); + target.SetSectionLoadAddress(m_sections_up->GetSectionAtIndex(0), m_base); return true; } diff --git a/lldb/source/Plugins/Process/Utility/CMakeLists.txt b/lldb/source/Plugins/Process/Utility/CMakeLists.txt index 0e1a5069d4409..f269f5d7d4d74 100644 --- a/lldb/source/Plugins/Process/Utility/CMakeLists.txt +++ b/lldb/source/Plugins/Process/Utility/CMakeLists.txt @@ -15,6 +15,7 @@ add_lldb_library(lldbPluginProcessUtility NativeRegisterContextDBReg_x86.cpp NativeRegisterContextRegisterInfo.cpp NetBSDSignals.cpp + OpenBSDSignals.cpp RegisterContext_x86.cpp RegisterContextDarwin_arm.cpp RegisterContextDarwin_arm64.cpp diff --git a/lldb/source/Plugins/Process/Utility/OpenBSDSignals.cpp b/lldb/source/Plugins/Process/Utility/OpenBSDSignals.cpp new file mode 100644 index 0000000000000..48263235126c0 --- /dev/null +++ b/lldb/source/Plugins/Process/Utility/OpenBSDSignals.cpp @@ -0,0 +1,69 @@ +//===-- OpenBSDSignals.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "OpenBSDSignals.h" + +#ifdef __OpenBSD__ +#include + +#define ADD_SIGCODE(signal_name, signal_value, code_name, code_value, ...) \ + static_assert(signal_name == signal_value, \ + "Value mismatch for signal number " #signal_name); \ + static_assert(code_name == code_value, \ + "Value mismatch for signal code " #code_name); \ + AddSignalCode(signal_value, code_value, __VA_ARGS__) +#else +#define ADD_SIGCODE(signal_name, signal_value, code_name, code_value, ...) 
\ + AddSignalCode(signal_value, code_value, __VA_ARGS__) +#endif /* ifdef __OpenBSD__ */ + +using namespace lldb_private; + +OpenBSDSignals::OpenBSDSignals() : UnixSignals() { Reset(); } + +void OpenBSDSignals::Reset() { + UnixSignals::Reset(); + + // clang-format off + // SIGILL + ADD_SIGCODE(SIGILL, 4, ILL_ILLOPC, 1, "illegal opcode"); + ADD_SIGCODE(SIGILL, 4, ILL_ILLOPN, 2, "illegal operand"); + ADD_SIGCODE(SIGILL, 4, ILL_ILLADR, 3, "illegal addressing mode"); + ADD_SIGCODE(SIGILL, 4, ILL_ILLTRP, 4, "illegal trap"); + ADD_SIGCODE(SIGILL, 4, ILL_PRVOPC, 5, "privileged opcode"); + ADD_SIGCODE(SIGILL, 4, ILL_PRVREG, 6, "privileged register"); + ADD_SIGCODE(SIGILL, 4, ILL_COPROC, 7, "coprocessor error"); + ADD_SIGCODE(SIGILL, 4, ILL_BADSTK, 8, "internal stack error"); + ADD_SIGCODE(SIGILL, 4, ILL_BTCFI, 9, "IBT missing on indirect call"); + + // SIGFPE + ADD_SIGCODE(SIGFPE, 8, FPE_INTDIV, 1, "integer divide by zero"); + ADD_SIGCODE(SIGFPE, 8, FPE_INTOVF, 2, "integer overflow"); + ADD_SIGCODE(SIGFPE, 8, FPE_FLTDIV, 3, "floating point divide by zero"); + ADD_SIGCODE(SIGFPE, 8, FPE_FLTOVF, 4, "floating point overflow"); + ADD_SIGCODE(SIGFPE, 8, FPE_FLTUND, 5, "floating point underflow"); + ADD_SIGCODE(SIGFPE, 8, FPE_FLTRES, 6, "floating point inexact result"); + ADD_SIGCODE(SIGFPE, 8, FPE_FLTINV, 7, "invalid floating point operation"); + ADD_SIGCODE(SIGFPE, 8, FPE_FLTSUB, 8, "subscript out of range"); + + // SIGBUS + ADD_SIGCODE(SIGBUS, 10, BUS_ADRALN, 1, "invalid address alignment"); + ADD_SIGCODE(SIGBUS, 10, BUS_ADRERR, 2, "non-existent physical address"); + ADD_SIGCODE(SIGBUS, 10, BUS_OBJERR, 3, "object specific hardware error"); + + // SIGSEGV + ADD_SIGCODE(SIGSEGV, 11, SEGV_MAPERR, 1, "address not mapped to object", + SignalCodePrintOption::Address); + ADD_SIGCODE(SIGSEGV, 11, SEGV_ACCERR, 2, "invalid permissions for mapped object", + SignalCodePrintOption::Address); + + // SIGNO NAME SUPPRESS STOP NOTIFY DESCRIPTION + // ===== ============== ======== ====== ====== ======================== + AddSignal(32, "SIGTHR", false, false, false, "thread library AST"); + // clang-format on +} diff --git a/lldb/source/Plugins/Process/Utility/OpenBSDSignals.h b/lldb/source/Plugins/Process/Utility/OpenBSDSignals.h new file mode 100644 index 0000000000000..1e2b1fa9d26db --- /dev/null +++ b/lldb/source/Plugins/Process/Utility/OpenBSDSignals.h @@ -0,0 +1,27 @@ +//===-- OpenBSDSignals.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_PROCESS_UTILITY_OPENBSDSIGNALS_H +#define LLDB_SOURCE_PLUGINS_PROCESS_UTILITY_OPENBSDSIGNALS_H + +#include "lldb/Target/UnixSignals.h" + +namespace lldb_private { + +/// OpenBSD specific set of Unix signals.
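// [Worked example, not from the patch] On an OpenBSD host the ADD_SIGCODE
// macro above both registers the code and cross-checks the hard-coded
// numbers against the system headers, so
//
//   ADD_SIGCODE(SIGSEGV, 11, SEGV_MAPERR, 1, "address not mapped to object",
//               SignalCodePrintOption::Address);
//
// expands (roughly) to:
//
//   static_assert(SIGSEGV == 11, "Value mismatch for signal number SIGSEGV");
//   static_assert(SEGV_MAPERR == 1,
//                 "Value mismatch for signal code SEGV_MAPERR");
//   AddSignalCode(11, 1, "address not mapped to object",
//                 SignalCodePrintOption::Address);
//
// while on every other host only the AddSignalCode(...) call survives, so
// the table remains usable when cross-debugging OpenBSD targets.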
+class OpenBSDSignals : public UnixSignals { +public: + OpenBSDSignals(); + +private: + void Reset() override; +}; + +} // namespace lldb_private + +#endif // LLDB_SOURCE_PLUGINS_PROCESS_UTILITY_OPENBSDSIGNALS_H diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index 05b3bb9f54f9c..ef3c00e2857df 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -383,12 +383,12 @@ void ProcessMinidump::BuildMemoryRegions() { MemoryRegionInfos to_add; ModuleList &modules = GetTarget().GetImages(); - SectionLoadList &load_list = GetTarget().GetSectionLoadList(); + Target &target = GetTarget(); modules.ForEach([&](const ModuleSP &module_sp) { SectionList *sections = module_sp->GetSectionList(); for (size_t i = 0; i < sections->GetSize(); ++i) { SectionSP section_sp = sections->GetSectionAtIndex(i); - addr_t load_addr = load_list.GetSectionLoadAddress(section_sp); + addr_t load_addr = target.GetSectionLoadAddress(section_sp); if (load_addr == LLDB_INVALID_ADDRESS) continue; MemoryRegionInfo::RangeType section_range(load_addr, diff --git a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTBundleSaver.cpp b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTBundleSaver.cpp index a09bb372bb01c..3b1535a931999 100644 --- a/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTBundleSaver.cpp +++ b/lldb/source/Plugins/Trace/intel-pt/TraceIntelPTBundleSaver.cpp @@ -263,8 +263,7 @@ BuildModulesSection(Process &process, FileSpec directory) { lldb::addr_t load_addr = LLDB_INVALID_ADDRESS; Address base_addr(objfile->GetBaseAddress()); - if (base_addr.IsValid() && - !process.GetTarget().GetSectionLoadList().IsEmpty()) + if (base_addr.IsValid() && process.GetTarget().HasLoadedSections()) load_addr = base_addr.GetLoadAddress(&process.GetTarget()); if (load_addr == LLDB_INVALID_ADDRESS) diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp index c9523281dc565..15879f05a0ff0 100644 --- a/lldb/source/Symbol/Function.cpp +++ b/lldb/source/Symbol/Function.cpp @@ -488,7 +488,7 @@ lldb::DisassemblerSP Function::GetInstructions(const ExecutionContext &exe_ctx, if (module_sp && exe_ctx.HasTargetScope()) { return Disassembler::DisassembleRange( module_sp->GetArchitecture(), nullptr, nullptr, nullptr, flavor, - exe_ctx.GetTargetRef(), GetAddressRange(), !prefer_file_cache); + exe_ctx.GetTargetRef(), GetAddressRanges(), !prefer_file_cache); } return lldb::DisassemblerSP(); } diff --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp index d3881f8ccf7fe..264acad050e35 100644 --- a/lldb/source/Symbol/ObjectFile.cpp +++ b/lldb/source/Symbol/ObjectFile.cpp @@ -646,8 +646,7 @@ ObjectFile::GetLoadableData(Target &target) { for (size_t i = 0; i < section_count; ++i) { LoadableData loadable; SectionSP section_sp = section_list->GetSectionAtIndex(i); - loadable.Dest = - target.GetSectionLoadList().GetSectionLoadAddress(section_sp); + loadable.Dest = target.GetSectionLoadAddress(section_sp); if (loadable.Dest == LLDB_INVALID_ADDRESS) continue; // We can skip sections like bss diff --git a/lldb/source/Target/ProcessTrace.cpp b/lldb/source/Target/ProcessTrace.cpp index 4718a7ca50a7c..f131339905474 100644 --- a/lldb/source/Target/ProcessTrace.cpp +++ b/lldb/source/Target/ProcessTrace.cpp @@ -123,7 +123,7 @@ bool ProcessTrace::GetProcessInfo(ProcessInstanceInfo &info) { size_t ProcessTrace::DoReadMemory(addr_t addr, void *buf, size_t size, Status &error) 
{ Address resolved_address; - GetTarget().GetSectionLoadList().ResolveLoadAddress(addr, resolved_address); + GetTarget().ResolveLoadAddress(addr, resolved_address); return GetTarget().ReadMemoryFromFileCache(resolved_address, buf, size, error); diff --git a/lldb/source/Target/SectionLoadHistory.cpp b/lldb/source/Target/SectionLoadHistory.cpp index f329d425e34b2..99797b1d1abc5 100644 --- a/lldb/source/Target/SectionLoadHistory.cpp +++ b/lldb/source/Target/SectionLoadHistory.cpp @@ -112,13 +112,15 @@ SectionLoadHistory::GetSectionLoadAddress(uint32_t stop_id, } bool SectionLoadHistory::ResolveLoadAddress(uint32_t stop_id, addr_t load_addr, - Address &so_addr) { + Address &so_addr, + bool allow_section_end) { // First find the top level section that this load address exists in std::lock_guard guard(m_mutex); const bool read_only = true; SectionLoadList *section_load_list = GetSectionLoadListForStopID(stop_id, read_only); - return section_load_list->ResolveLoadAddress(load_addr, so_addr); + return section_load_list->ResolveLoadAddress(load_addr, so_addr, + allow_section_end); } bool SectionLoadHistory::SetSectionLoadAddress( diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 46216ba2d566d..8d77097477651 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -270,12 +270,18 @@ void Target::DeleteCurrentProcess() { if (m_process_sp) { // We dispose any active tracing sessions on the current process m_trace_sp.reset(); - m_section_load_history.Clear(); + if (m_process_sp->IsAlive()) m_process_sp->Destroy(false); m_process_sp->Finalize(false /* not destructing */); + // Let the process finalize itself first, then clear the section load + // history. Some objects owned by the process might end up calling + // SectionLoadHistory::SetSectionUnloaded() which can create entries in + // the section load history that can mess up subsequent processes. 
+ m_section_load_history.Clear(); + CleanupProcess(); m_process_sp.reset(); @@ -3217,8 +3223,9 @@ Status Target::Install(ProcessLaunchInfo *launch_info) { } bool Target::ResolveLoadAddress(addr_t load_addr, Address &so_addr, - uint32_t stop_id) { - return m_section_load_history.ResolveLoadAddress(stop_id, load_addr, so_addr); + uint32_t stop_id, bool allow_section_end) { + return m_section_load_history.ResolveLoadAddress(stop_id, load_addr, so_addr, + allow_section_end); } bool Target::ResolveFileAddress(lldb::addr_t file_addr, @@ -5147,3 +5154,15 @@ Target::ReportStatistics(const lldb_private::StatisticsOptions &options) { } void Target::ResetStatistics() { m_stats.Reset(*this); } + +bool Target::HasLoadedSections() { return !GetSectionLoadList().IsEmpty(); } + +lldb::addr_t Target::GetSectionLoadAddress(const lldb::SectionSP &section_sp) { + return GetSectionLoadList().GetSectionLoadAddress(section_sp); +} + +void Target::ClearSectionLoadList() { GetSectionLoadList().Clear(); } + +void Target::DumpSectionLoadList(Stream &s) { + GetSectionLoadList().Dump(s, this); +} diff --git a/lldb/source/Target/ThreadPlanStepInRange.cpp b/lldb/source/Target/ThreadPlanStepInRange.cpp index 4a2ede8b39728..109d1b6b3435b 100644 --- a/lldb/source/Target/ThreadPlanStepInRange.cpp +++ b/lldb/source/Target/ThreadPlanStepInRange.cpp @@ -263,8 +263,7 @@ bool ThreadPlanStepInRange::ShouldStop(Event *event_ptr) { const Architecture *arch = GetTarget().GetArchitecturePlugin(); if (arch) { Address curr_sec_addr; - GetTarget().GetSectionLoadList().ResolveLoadAddress(curr_addr, - curr_sec_addr); + GetTarget().ResolveLoadAddress(curr_addr, curr_sec_addr); bytes_to_skip = arch->GetBytesToSkip(*sc.symbol, curr_sec_addr); } } diff --git a/lldb/source/Target/ThreadPlanTracer.cpp b/lldb/source/Target/ThreadPlanTracer.cpp index ff9f49c6d4bb6..356ce379c2993 100644 --- a/lldb/source/Target/ThreadPlanTracer.cpp +++ b/lldb/source/Target/ThreadPlanTracer.cpp @@ -140,8 +140,7 @@ void ThreadPlanAssemblyTracer::Log() { Address pc_addr; bool addr_valid = false; uint8_t buffer[16] = {0}; // Must be big enough for any single instruction - addr_valid = m_process.GetTarget().GetSectionLoadList().ResolveLoadAddress( - pc, pc_addr); + addr_valid = m_process.GetTarget().ResolveLoadAddress(pc, pc_addr); pc_addr.Dump(stream, &GetThread(), Address::DumpStyleResolvedDescription, Address::DumpStyleModuleWithFileAddress); diff --git a/lldb/source/Target/UnixSignals.cpp b/lldb/source/Target/UnixSignals.cpp index e3c7a83ece073..bee3a63818259 100644 --- a/lldb/source/Target/UnixSignals.cpp +++ b/lldb/source/Target/UnixSignals.cpp @@ -10,6 +10,7 @@ #include "Plugins/Process/Utility/FreeBSDSignals.h" #include "Plugins/Process/Utility/LinuxSignals.h" #include "Plugins/Process/Utility/NetBSDSignals.h" +#include "Plugins/Process/Utility/OpenBSDSignals.h" #include "lldb/Host/HostInfo.h" #include "lldb/Utility/ArchSpec.h" #include @@ -32,10 +33,11 @@ lldb::UnixSignalsSP UnixSignals::Create(const ArchSpec &arch) { case llvm::Triple::Linux: return std::make_shared<LinuxSignals>(); case llvm::Triple::FreeBSD: - case llvm::Triple::OpenBSD: return std::make_shared<FreeBSDSignals>(); case llvm::Triple::NetBSD: return std::make_shared<NetBSDSignals>(); + case llvm::Triple::OpenBSD: + return std::make_shared<OpenBSDSignals>(); default: return std::make_shared<UnixSignals>(); } diff --git a/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s b/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s index a9e4104f2aaf7..2e2bc52cd3ff9 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s +++ 
b/lldb/test/Shell/ScriptInterpreter/Python/sb_function_ranges.s @@ -6,6 +6,16 @@ # CHECK: Found 1 function(s). # CHECK: foo: [input.o[0x0-0xe), input.o[0x14-0x1c)] +# CHECK-NEXT: input.o[0x0]: cmpl $0x0, %edi +# CHECK-NEXT: input.o[0x3]: je 0x14 +# CHECK-NEXT: input.o[0x5]: jmp 0x7 +# CHECK-NEXT: input.o[0x7]: callq 0xe +# CHECK-NEXT: input.o[0xc]: jmp 0x1b +# CHECK-EMPTY: +# CHECK-NEXT: input.o[0x14]: callq 0x19 +# CHECK-NEXT: input.o[0x19]: jmp 0x1b +# CHECK-NEXT: input.o[0x1b]: retq + #--- script.py import lldb @@ -17,6 +27,7 @@ def __lldb_init_module(debugger, internal_dict): for ctx in sym_ctxs: fn = ctx.function print(f"{fn.name}: {fn.GetRanges()}") + print(fn.GetInstructions(target)) #--- input.s # An example of a function which has been split into two parts. Roughly diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst index d357c2ceea418..8266d891a5e6b 100644 --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -301,7 +301,6 @@ OPTIONS * ``prepare-and-assemble-snippet``: Same as ``prepare-snippet``, but also dumps an excerpt of the sequence (hex encoded). * ``assemble-measured-code``: Same as ``prepare-and-assemble-snippet``. but also creates the full sequence that can be dumped to a file using ``--dump-object-to-disk``. * ``measure``: Same as ``assemble-measured-code``, but also runs the measurement. - * ``dry-run-measurement``: Same as measure, but does not actually execute the snippet. .. option:: --x86-lbr-sample-period= diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h index e9f0f089adfef..64598ce449a44 100644 --- a/llvm/include/llvm/CodeGen/ModuloSchedule.h +++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h @@ -359,8 +359,8 @@ class PeelingModuloScheduleExpander { MachineBasicBlock *CreateLCSSAExitingBlock(); /// Helper to get the stage of an instruction in the schedule. unsigned getStage(MachineInstr *MI) { - if (CanonicalMIs.count(MI)) - MI = CanonicalMIs[MI]; + if (auto It = CanonicalMIs.find(MI); It != CanonicalMIs.end()) + MI = It->second; return Schedule.getStage(MI); } /// Helper function to find the right canonical register for a phi instruction diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 2af9119670141..67bcb00787312 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -1387,10 +1387,26 @@ class LinkGraph { GetExternalSymbolMapEntryValue())); } + /// Returns the external symbol with the given name if one exists, otherwise + /// returns nullptr. + Symbol *findExternalSymbolByName(const orc::SymbolStringPtrBase &Name) { + for (auto *Sym : external_symbols()) + if (Sym->getName() == Name) + return Sym; + return nullptr; + } + iterator_range absolute_symbols() { return make_range(AbsoluteSymbols.begin(), AbsoluteSymbols.end()); } + Symbol *findAbsoluteSymbolByName(const orc::SymbolStringPtrBase &Name) { + for (auto *Sym : absolute_symbols()) + if (Sym->getName() == Name) + return Sym; + return nullptr; + } + iterator_range defined_symbols() { auto Secs = sections(); return make_range(defined_symbol_iterator(Secs.begin(), Secs.end()), @@ -1403,6 +1419,15 @@ class LinkGraph { const_defined_symbol_iterator(Secs.end(), Secs.end())); } + /// Returns the defined symbol with the given name if one exists, otherwise + /// returns nullptr. 
+ Symbol *findDefinedSymbolByName(const orc::SymbolStringPtrBase &Name) { + for (auto *Sym : defined_symbols()) + if (Sym->hasName() && Sym->getName() == Name) + return Sym; + return nullptr; + } + /// Make the given symbol external (must not already be external). /// /// Symbol size, linkage and callability will be left unchanged. Symbol scope diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 7eceec3d8cf8f..6b6e5bc19d95a 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -2994,27 +2994,29 @@ class OpenMPIRBuilder { /// \param Loc where the target data construct was encountered. /// \param IsOffloadEntry whether it is an offload entry. /// \param CodeGenIP The insertion point where the call to the outlined - /// function should be emitted. + /// function should be emitted. /// \param EntryInfo The entry information about the function. /// \param DefaultAttrs Structure containing the default attributes, including /// numbers of threads and teams to launch the kernel with. /// \param RuntimeAttrs Structure containing the runtime numbers of threads /// and teams to launch the kernel with. + /// \param IfCond value of the `if` clause. /// \param Inputs The input values to the region that will be passed. - /// as arguments to the outlined function. + /// as arguments to the outlined function. /// \param BodyGenCB Callback that will generate the region code. /// \param ArgAccessorFuncCB Callback that will generate accessors - /// instructions for passed in target arguments where neccessary + /// instructions for passed in target arguments where necessary /// \param Dependencies A vector of DependData objects that carry - // dependency information as passed in the depend clause - // \param HasNowait Whether the target construct has a `nowait` clause or not. + /// dependency information as passed in the depend clause + /// \param HasNowait Whether the target construct has a `nowait` clause or + /// not. InsertPointOrErrorTy createTarget( const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, - const TargetKernelRuntimeAttrs &RuntimeAttrs, + const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index 0332a6cc2e76e..833c91fd97461 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -854,12 +854,13 @@ class IRBuilderBase { Value *Mask = nullptr); /// Create a call to Masked Expand Load intrinsic - CallInst *CreateMaskedExpandLoad(Type *Ty, Value *Ptr, Value *Mask = nullptr, + CallInst *CreateMaskedExpandLoad(Type *Ty, Value *Ptr, MaybeAlign Align, + Value *Mask = nullptr, Value *PassThru = nullptr, const Twine &Name = ""); /// Create a call to Masked Compress Store intrinsic - CallInst *CreateMaskedCompressStore(Value *Val, Value *Ptr, + CallInst *CreateMaskedCompressStore(Value *Val, Value *Ptr, MaybeAlign Align, Value *Mask = nullptr); /// Return an all true boolean vector (mask) with \p NumElts lanes. 
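// [Sketch, not part of the patch] Caller-side view of the updated IRBuilder
// entry points above: the new MaybeAlign parameter is attached to the
// pointer argument as an `align` attribute instead of being left implicit.
// `emitExpandLoadStore` is a hypothetical helper, and the 16-byte alignment
// is an assumption made for the example.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

llvm::Value *emitExpandLoadStore(llvm::IRBuilderBase &B, llvm::Value *Ptr,
                                 llvm::Value *Mask) {
  auto *VecTy =
      llvm::FixedVectorType::get(llvm::Type::getInt32Ty(B.getContext()), 4);
  // Emits llvm.masked.expandload.v4i32 with `align 16` on the pointer operand.
  llvm::CallInst *Load =
      B.CreateMaskedExpandLoad(VecTy, Ptr, llvm::MaybeAlign(16), Mask);
  // Emits llvm.masked.compressstore.v4i32 with the same alignment attribute
  // on its pointer operand (operand 1).
  B.CreateMaskedCompressStore(Load, Ptr, llvm::MaybeAlign(16), Mask);
  return Load;
}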
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index b930d6983e225..b529642a55871 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -876,6 +876,8 @@ class AMDGPUSampleVariant extra_addr // Name of the {lod} or {clamp} argument that is appended to the coordinates, // if any. string LodOrClamp = ""; + + bit UsesWQM = false; } // AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE @@ -905,8 +907,9 @@ defset list AMDGPUSampleVariants = { } defset list AMDGPUSampleVariantsNoGradients = { + let UsesWQM = true in defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>; - let Bias = true in + let Bias = true, UsesWQM = true in defm AMDGPUSample : AMDGPUSampleHelper_Clamp< "_B", "_b", [AMDGPUArg]>; let LodOrClamp = "lod" in @@ -1172,7 +1175,8 @@ defset list AMDGPUImageDimIntrinsics = { foreach dim = AMDGPUDims.NoMsaa in { def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic< AMDGPUDimSampleProfile, - !if(NoMem, [IntrNoMem], [IntrReadMem]), + !listconcat(!if(NoMem, [IntrNoMem], [IntrReadMem]), + !if(sample.UsesWQM, [IntrConvergent], [])), !if(NoMem, [], [SDNPMemOperand])>; } } @@ -1188,7 +1192,8 @@ defset list AMDGPUImageDimIntrinsics = { foreach dim = AMDGPUDims.NoMsaa in { def !strconcat(NAME, "_", dim.Name, "_nortn") : AMDGPUImageDimIntrinsic< AMDGPUDimSampleNoReturnProfile, - [IntrWillReturn], [SDNPMemOperand]>; + !listconcat([IntrWillReturn], !if(sample.UsesWQM, [IntrConvergent], [])), + [SDNPMemOperand]>; } } foreach sample = AMDGPUSampleVariants in { diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 76914ab34c1f6..8097300c6e630 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -366,14 +366,26 @@ class Triple { /// @name Normalization /// @{ + /// Canonical form + enum class CanonicalForm { + ANY = 0, + THREE_IDENT = 3, // ARCHITECTURE-VENDOR-OPERATING_SYSTEM + FOUR_IDENT = 4, // ARCHITECTURE-VENDOR-OPERATING_SYSTEM-ENVIRONMENT + FIVE_IDENT = 5, // ARCHITECTURE-VENDOR-OPERATING_SYSTEM-ENVIRONMENT-FORMAT + }; + /// Turn an arbitrary machine specification into the canonical triple form (or /// something sensible that the Triple class understands if nothing better can /// reasonably be done). In particular, it handles the common case in which - /// otherwise valid components are in the wrong order. - static std::string normalize(StringRef Str); + /// otherwise valid components are in the wrong order. \p Form is used to + /// specify the output canonical form. + static std::string normalize(StringRef Str, + CanonicalForm Form = CanonicalForm::ANY); /// Return the normalized form of this triple's string. - std::string normalize() const { return normalize(Data); } + std::string normalize(CanonicalForm Form = CanonicalForm::ANY) const { + return normalize(Data, Form); + } /// @} /// @name Typed Component Access diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 349a0a1a2d3c4..20f69a0955f51 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1159,6 +1159,27 @@ getRangeViaSLT(CmpInst::Predicate Pred, APInt RHS, return std::nullopt; } +/// Get value range for a "ctpop(Val) Pred RHS" condition. 
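// [Worked example, not from the patch] For the helper added below, take i8
// values and the edge condition `ctpop(x) == 3`. makeExactICmpRegion yields
// ResMin = ResMax = 3, so
//   ValMin = APInt::getLowBitsSet(8, 3)  = 0b00000111 =   7
//            (the smallest i8 with three bits set)
//   ValMax = APInt::getHighBitsSet(8, 3) = 0b11100000 = 224
//            (the largest i8 with three bits set)
// and the resulting lattice value is the half-open range [7, 225).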
+static ValueLatticeElement getValueFromICmpCtpop(ICmpInst::Predicate Pred, + Value *RHS) { + unsigned BitWidth = RHS->getType()->getScalarSizeInBits(); + + auto *RHSConst = dyn_cast<ConstantInt>(RHS); + if (!RHSConst) + return ValueLatticeElement::getOverdefined(); + + ConstantRange ResValRange = + ConstantRange::makeExactICmpRegion(Pred, RHSConst->getValue()); + + unsigned ResMin = ResValRange.getUnsignedMin().getLimitedValue(BitWidth); + unsigned ResMax = ResValRange.getUnsignedMax().getLimitedValue(BitWidth); + + APInt ValMin = APInt::getLowBitsSet(BitWidth, ResMin); + APInt ValMax = APInt::getHighBitsSet(BitWidth, ResMax); + return ValueLatticeElement::getRange( + ConstantRange::getNonEmpty(std::move(ValMin), ValMax + 1)); +} + std::optional<ValueLatticeElement> LazyValueInfoImpl::getValueFromICmpCondition( Value *Val, ICmpInst *ICI, bool isTrueDest, bool UseBlockValue) { Value *LHS = ICI->getOperand(0); @@ -1192,6 +1213,9 @@ std::optional<ValueLatticeElement> LazyValueInfoImpl::getValueFromICmpCondition( return getValueFromSimpleICmpCondition(SwappedPred, LHS, Offset, ICI, UseBlockValue); + if (match(LHS, m_Intrinsic<Intrinsic::ctpop>(m_Specific(Val)))) + return getValueFromICmpCtpop(EdgePred, RHS); + const APInt *Mask, *C; if (match(LHS, m_And(m_Specific(Val), m_APInt(Mask))) && match(RHS, m_APInt(C))) { diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index b664b54c044f5..6e3232772706a 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -183,6 +183,18 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, AATags); return MemoryLocation::getAfter(Arg, AATags); + case Intrinsic::experimental_memset_pattern: + assert((ArgIdx == 0 || ArgIdx == 1) && + "Invalid argument index for memory intrinsic"); + if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2))) + return MemoryLocation( + Arg, + LocationSize::precise( + LenCI->getZExtValue() * + DL.getTypeAllocSize(II->getArgOperand(1)->getType())), + AATags); + return MemoryLocation::getAfter(Arg, AATags); + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: case Intrinsic::invariant_start: diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 81d048b32e139..be6166f0c4169 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -941,8 +941,8 @@ bool LLParser::parseMDNodeID(MDNode *&Result) { return true; // If not a forward reference, just return it now. - if (NumberedMetadata.count(MID)) { - Result = NumberedMetadata[MID]; + if (auto It = NumberedMetadata.find(MID); It != NumberedMetadata.end()) { + Result = It->second; return false; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 94d3afa6c1e33..31c96400dd0fe 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -494,6 +494,9 @@ class IndexBitcodeWriter : public BitcodeWriterBase { // are currently saved in the index in terms of GUID. forEachSummary([&](GVInfo I, bool IsAliasee) { GUIDToValueIdMap[I.first] = ++GlobalValueId; + // If this is invoked for an aliasee, we want to record the above mapping, + // but not the information needed for its summary entry (if the aliasee is + // to be imported, we will invoke this separately with IsAliasee=false). + if (IsAliasee) + return; auto *FS = dyn_cast<FunctionSummary>(I.second); @@ -4847,6 +4850,11 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // radix tree array are identified based on this order. 
MapVector> CallStacks; forEachSummary([&](GVInfo I, bool IsAliasee) { + // Don't collect this when invoked for an aliasee, as it is not needed for + // the alias summary. If the aliasee is to be imported, we will invoke this + // separately with IsAliasee=false. + if (IsAliasee) + return; GlobalValueSummary *S = I.second; assert(S); auto *FS = dyn_cast<FunctionSummary>(S); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index ba1b10ec8b9b1..a3392b7110989 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7690,8 +7690,8 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { // sunk instruction uses, if it is part of a chain that has already been // sunk. Instruction *OldI = cast<Instruction>(U->getUser()); - if (NewInstructions.count(OldI)) - NewInstructions[OldI]->setOperand(U->getOperandNo(), NI); + if (auto It = NewInstructions.find(OldI); It != NewInstructions.end()) + It->second->setOperand(U->getOperandNo(), NI); else U->set(NI); Changed = true; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 02b79c67af3ee..6805e0cb23ace 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -26160,26 +26160,27 @@ static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf, // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant. assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() && "Shuffle mask value must be from operand 0"); - if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2)); - if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index]) - return SDValue(); + SDValue Elt; + if (sd_match(Op0, m_InsertElt(m_Value(), m_Value(Elt), + m_SpecificInt(Mask[ShufOp0Index])))) { + // There's an existing insertelement with constant insertion index, so we + // don't need to check the legality/profitability of a replacement operation + // that differs at most in the constant value. The target should be able to + // lower any of those in a similar way. If not, legalization will expand + // this to a scalar-to-vector plus shuffle. + // + // Note that the shuffle may move the scalar from the position that the + // insert element used. Therefore, our new insert element occurs at the + // shuffle's mask index value, not the insert's index value. + // + // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C' + SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf)); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(), + Op1, Elt, NewInsIndex); + } - // There's an existing insertelement with constant insertion index, so we - // don't need to check the legality/profitability of a replacement operation - // that differs at most in the constant value. The target should be able to - // lower any of those in a similar way. If not, legalization will expand this - // to a scalar-to-vector plus shuffle. - // - // Note that the shuffle may move the scalar from the position that the insert - // element used. Therefore, our new insert element occurs at the shuffle's - // mask index value, not the insert's index value. 
- // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C' - SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf)); - return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(), - Op1, Op0.getOperand(1), NewInsIndex); + return SDValue(); } /// If we have a unary shuffle of a shuffle, see if it can be folded away diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp index d3315aad126cb..e898d336dbe40 100644 --- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp @@ -635,32 +635,16 @@ Symbol *GetImageBaseSymbol::operator()(LinkGraph &G) { return *ImageBase; auto IBN = G.intern(ImageBaseName); + ImageBase = G.findExternalSymbolByName(IBN); + if (*ImageBase) + return *ImageBase; + ImageBase = G.findAbsoluteSymbolByName(IBN); + if (*ImageBase) + return *ImageBase; + ImageBase = G.findDefinedSymbolByName(IBN); + if (*ImageBase) + return *ImageBase; - // Check external symbols for image base. - for (auto *Sym : G.external_symbols()) { - if (Sym->getName() == IBN) { - ImageBase = Sym; - return Sym; - } - } - - // Check absolute symbols (unexpected, but legal). - for (auto *Sym : G.absolute_symbols()) { - if (Sym->getName() == IBN) { - ImageBase = Sym; - return Sym; - } - } - - // Finally, check defined symbols. - for (auto *Sym : G.defined_symbols()) { - if (Sym->hasName() && Sym->getName() == IBN) { - ImageBase = Sym; - return Sym; - } - } - - ImageBase = nullptr; return nullptr; } diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp index 8b6c88da52eb8..540dfdad5831b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp @@ -361,14 +361,14 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, // Add a keep-alive edge from the FDE target to the FDE to ensure that the // FDE is kept alive if its target is. LLVM_DEBUG({ - dbgs() << " Adding keep-alive edge from target at " + dbgs() << " Adding keep-alive edge from target at " << (*PCBegin)->getBlock().getAddress() << " to FDE at " << RecordAddress << "\n"; }); (*PCBegin)->getBlock().addEdge(Edge::KeepAlive, 0, FDESymbol, 0); } else { LLVM_DEBUG({ - dbgs() << " WARNING: Not adding keep-alive edge to FDE at " + dbgs() << " WARNING: Not adding keep-alive edge to FDE at " << RecordAddress << ", which points to " << ((*PCBegin)->isExternal() ? "external" : "absolute") << " symbol \"" << (*PCBegin)->getName() @@ -395,7 +395,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B, .takeError()) return Err; } else { - LLVM_DEBUG(dbgs() << " Record does not have LSDA field.\n"); + LLVM_DEBUG(dbgs() << " Record does not have LSDA field.\n"); } return Error::success(); diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h index 49fbf650e7a77..841ec9c055a2e 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h @@ -26,6 +26,9 @@ namespace jitlink { class EHFrameEdgeFixer { public: /// Create an eh-frame edge fixer. + /// Adds edges for implicit relocations on platforms where these are used + /// (e.g. MachO/x86-64). + /// /// If a given edge-kind is not supported on the target architecture then /// Edge::Invalid should be used. 
EHFrameEdgeFixer(StringRef EHFrameSectionName, unsigned PointerSize, diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index e0d40cf2de5aa..8e66d028f21ce 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -1492,26 +1492,19 @@ Error MachOPlatform::MachOPlatformPlugin::populateObjCRuntimeObject( memcpy(SD.Sec.sectname, "__objc_imageinfo", 16); strcpy(SD.Sec.segname, "__DATA"); SD.Sec.size = 8; - SD.AddFixups = [&](size_t RecordOffset) { + jitlink::Symbol *ObjCImageInfoSym = nullptr; + SD.AddFixups = [&, ObjCImageInfoSym](size_t RecordOffset) mutable { auto PointerEdge = getPointerEdgeKind(G); // Look for an existing __objc_imageinfo symbol. - jitlink::Symbol *ObjCImageInfoSym = nullptr; - for (auto *Sym : G.external_symbols()) - if (Sym->hasName() && *Sym->getName() == ObjCImageInfoSymbolName) { - ObjCImageInfoSym = Sym; - break; - } - if (!ObjCImageInfoSym) - for (auto *Sym : G.absolute_symbols()) - if (Sym->hasName() && *Sym->getName() == ObjCImageInfoSymbolName) { - ObjCImageInfoSym = Sym; - break; - } - if (!ObjCImageInfoSym) - for (auto *Sym : G.defined_symbols()) - if (Sym->hasName() && *Sym->getName() == ObjCImageInfoSymbolName) { - ObjCImageInfoSym = Sym; + if (!ObjCImageInfoSym) { + auto Name = G.intern(ObjCImageInfoSymbolName); + ObjCImageInfoSym = G.findExternalSymbolByName(Name); + if (!ObjCImageInfoSym) + ObjCImageInfoSym = G.findAbsoluteSymbolByName(Name); + if (!ObjCImageInfoSym) { + ObjCImageInfoSym = G.findDefinedSymbolByName(Name); + if (ObjCImageInfoSym) { std::optional Flags; { std::lock_guard Lock(PluginMutex); @@ -1525,16 +1518,17 @@ Error MachOPlatform::MachOPlatformPlugin::populateObjCRuntimeObject( if (Flags) { // We own the definition of __objc_image_info; write the final // merged flags value. 
- auto Content = Sym->getBlock().getMutableContent(G); - assert(Content.size() == 8 && + auto Content = ObjCImageInfoSym->getBlock().getMutableContent(G); + assert( + Content.size() == 8 && "__objc_image_info size should have been verified already"); support::endian::write32(&Content[4], *Flags, G.getEndianness()); } - break; } - if (!ObjCImageInfoSym) - ObjCImageInfoSym = - &G.addExternalSymbol(ObjCImageInfoSymbolName, 8, false); + } + if (!ObjCImageInfoSym) + ObjCImageInfoSym = &G.addExternalSymbol(std::move(Name), 8, false); + } SecBlock.addEdge(PointerEdge, RecordOffset + ((char *)&SD.Sec.addr - (char *)&SD.Sec), diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 3d461f0ad4228..c6603635d5e28 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -5308,8 +5308,8 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, Value *Alignment = AlignedItem.second; Instruction *loadInst = dyn_cast(AlignedPtr); Builder.SetInsertPoint(loadInst->getNextNode()); - Builder.CreateAlignmentAssumption(F->getDataLayout(), - AlignedPtr, Alignment); + Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr, + Alignment); } Builder.restoreIP(IP); } @@ -5457,16 +5457,16 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) { Loop *L = LI.getLoopFor(CLI->getHeader()); assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop"); - TargetTransformInfo::UnrollingPreferences UP = - gatherUnrollingPreferences(L, SE, TTI, - /*BlockFrequencyInfo=*/nullptr, - /*ProfileSummaryInfo=*/nullptr, ORE, static_cast(OptLevel), - /*UserThreshold=*/std::nullopt, - /*UserCount=*/std::nullopt, - /*UserAllowPartial=*/true, - /*UserAllowRuntime=*/true, - /*UserUpperBound=*/std::nullopt, - /*UserFullUnrollMaxCount=*/std::nullopt); + TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( + L, SE, TTI, + /*BlockFrequencyInfo=*/nullptr, + /*ProfileSummaryInfo=*/nullptr, ORE, static_cast(OptLevel), + /*UserThreshold=*/std::nullopt, + /*UserCount=*/std::nullopt, + /*UserAllowPartial=*/true, + /*UserAllowRuntime=*/true, + /*UserUpperBound=*/std::nullopt, + /*UserFullUnrollMaxCount=*/std::nullopt); UP.Force = true; @@ -7340,7 +7340,7 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, - Function *OutlinedFn, Constant *OutlinedFnID, + Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector Dependencies = {}, @@ -7386,9 +7386,9 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, return Error::success(); }; - // If we don't have an ID for the target region, it means an offload entry - // wasn't created. In this case we just run the host fallback directly. - if (!OutlinedFnID) { + auto &&EmitTargetCallElse = + [&](OpenMPIRBuilder::InsertPointTy AllocaIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error { // Assume no error was returned because EmitTargetCallFallbackCB doesn't // produce any. 
OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() { @@ -7404,102 +7404,126 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, }()); Builder.restoreIP(AfterIP); - return; - } - - OpenMPIRBuilder::TargetDataInfo Info( - /*RequiresDevicePointerInfo=*/false, - /*SeparateBeginEndCalls=*/true); - - OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP()); - OpenMPIRBuilder::TargetDataRTArgs RTArgs; - OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info, - RTArgs, MapInfo, - /*IsNonContiguous=*/true, - /*ForEndCall=*/false); - - SmallVector NumTeamsC; - for (auto [DefaultVal, RuntimeVal] : - zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams)) - NumTeamsC.push_back(RuntimeVal ? RuntimeVal : Builder.getInt32(DefaultVal)); - - // Calculate number of threads: 0 if no clauses specified, otherwise it is the - // minimum between optional THREAD_LIMIT and NUM_THREADS clauses. - auto InitMaxThreadsClause = [&Builder](Value *Clause) { - if (Clause) - Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(), - /*isSigned=*/false); - return Clause; + return Error::success(); }; - auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) { - if (Clause) - Result = Result - ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause), + + auto &&EmitTargetCallThen = + [&](OpenMPIRBuilder::InsertPointTy AllocaIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error { + OpenMPIRBuilder::TargetDataInfo Info( + /*RequiresDevicePointerInfo=*/false, + /*SeparateBeginEndCalls=*/true); + + OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP()); + OpenMPIRBuilder::TargetDataRTArgs RTArgs; + OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info, + RTArgs, MapInfo, + /*IsNonContiguous=*/true, + /*ForEndCall=*/false); + + SmallVector NumTeamsC; + for (auto [DefaultVal, RuntimeVal] : + zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams)) + NumTeamsC.push_back(RuntimeVal ? RuntimeVal + : Builder.getInt32(DefaultVal)); + + // Calculate number of threads: 0 if no clauses specified, otherwise it is + // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses. + auto InitMaxThreadsClause = [&Builder](Value *Clause) { + if (Clause) + Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(), + /*isSigned=*/false); + return Clause; + }; + auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) { + if (Clause) + Result = + Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause), Result, Clause) : Clause; - }; + }; - // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so - // the NUM_THREADS clause is overriden by THREAD_LIMIT. - SmallVector NumThreadsC; - Value *MaxThreadsClause = RuntimeAttrs.TeamsThreadLimit.size() == 1 - ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads) - : nullptr; + // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so + // the NUM_THREADS clause is overriden by THREAD_LIMIT. + SmallVector NumThreadsC; + Value *MaxThreadsClause = + RuntimeAttrs.TeamsThreadLimit.size() == 1 + ? 
InitMaxThreadsClause(RuntimeAttrs.MaxThreads) + : nullptr; - for (auto [TeamsVal, TargetVal] : zip_equal(RuntimeAttrs.TeamsThreadLimit, - RuntimeAttrs.TargetThreadLimit)) { - Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal); - Value *NumThreads = InitMaxThreadsClause(TargetVal); + for (auto [TeamsVal, TargetVal] : zip_equal( + RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) { + Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal); + Value *NumThreads = InitMaxThreadsClause(TargetVal); - CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads); - CombineMaxThreadsClauses(MaxThreadsClause, NumThreads); + CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads); + CombineMaxThreadsClauses(MaxThreadsClause, NumThreads); - NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0)); - } + NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0)); + } - unsigned NumTargetItems = Info.NumberOfPtrs; - // TODO: Use correct device ID - Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF); - uint32_t SrcLocStrSize; - Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize); - Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize, - llvm::omp::IdentFlag(0), 0); + unsigned NumTargetItems = Info.NumberOfPtrs; + // TODO: Use correct device ID + Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF); + uint32_t SrcLocStrSize; + Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize); + Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize, + llvm::omp::IdentFlag(0), 0); - Value *TripCount = RuntimeAttrs.LoopTripCount - ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount, - Builder.getInt64Ty(), - /*isSigned=*/false) - : Builder.getInt64(0); + Value *TripCount = RuntimeAttrs.LoopTripCount + ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount, + Builder.getInt64Ty(), + /*isSigned=*/false) + : Builder.getInt64(0); - // TODO: Use correct DynCGGroupMem - Value *DynCGGroupMem = Builder.getInt32(0); + // TODO: Use correct DynCGGroupMem + Value *DynCGGroupMem = Builder.getInt32(0); - KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount, - NumTeamsC, NumThreadsC, - DynCGGroupMem, HasNoWait); + KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount, + NumTeamsC, NumThreadsC, + DynCGGroupMem, HasNoWait); - // Assume no error was returned because TaskBodyCB and - // EmitTargetCallFallbackCB don't produce any. - OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() { - // The presence of certain clauses on the target directive require the - // explicit generation of the target task. - if (RequiresOuterTargetTask) - return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP, - Dependencies, HasNoWait); + // Assume no error was returned because TaskBodyCB and + // EmitTargetCallFallbackCB don't produce any. + OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() { + // The presence of certain clauses on the target directive require the + // explicit generation of the target task. 
+ if (RequiresOuterTargetTask) + return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP, + Dependencies, HasNoWait); + + return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID, + EmitTargetCallFallbackCB, KArgs, + DeviceID, RTLoc, AllocaIP); + }()); + + Builder.restoreIP(AfterIP); + return Error::success(); + }; - return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID, - EmitTargetCallFallbackCB, KArgs, - DeviceID, RTLoc, AllocaIP); - }()); + // If we don't have an ID for the target region, it means an offload entry + // wasn't created. In this case we just run the host fallback directly and + // ignore any potential 'if' clauses. + if (!OutlinedFnID) { + cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP())); + return; + } + + // If there's no 'if' clause, only generate the kernel launch code path. + if (!IfCond) { + cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP())); + return; + } - Builder.restoreIP(AfterIP); + cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen, + EmitTargetCallElse, AllocaIP)); } OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, - const TargetKernelRuntimeAttrs &RuntimeAttrs, + const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl &Args, GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, @@ -7524,7 +7548,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( // to make a remote call (offload) to the previously outlined function // that represents the target region. Do that now. 
if (!Config.isTargetDevice()) - emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs, + emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Args, GenMapInfoCB, Dependencies, HasNowait); return Builder.saveIP(); diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 27b499e42a4e4..d46ae206890e8 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -644,13 +644,15 @@ CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs, /// Create a call to Masked Expand Load intrinsic /// \p Ty - vector type to load /// \p Ptr - base pointer for the load +/// \p Align - alignment of \p Ptr /// \p Mask - vector of booleans which indicates what vector lanes should /// be accessed in memory /// \p PassThru - pass-through value that is used to fill the masked-off lanes /// of the result /// \p Name - name of the result variable CallInst *IRBuilderBase::CreateMaskedExpandLoad(Type *Ty, Value *Ptr, - Value *Mask, Value *PassThru, + MaybeAlign Align, Value *Mask, + Value *PassThru, const Twine &Name) { assert(Ty->isVectorTy() && "Type should be vector"); assert(Mask && "Mask should not be all-ones (null)"); @@ -658,24 +660,32 @@ CallInst *IRBuilderBase::CreateMaskedExpandLoad(Type *Ty, Value *Ptr, PassThru = PoisonValue::get(Ty); Type *OverloadedTypes[] = {Ty}; Value *Ops[] = {Ptr, Mask, PassThru}; - return CreateMaskedIntrinsic(Intrinsic::masked_expandload, Ops, - OverloadedTypes, Name); + CallInst *CI = CreateMaskedIntrinsic(Intrinsic::masked_expandload, Ops, + OverloadedTypes, Name); + if (Align) + CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), *Align)); + return CI; } /// Create a call to Masked Compress Store intrinsic /// \p Val - data to be stored, /// \p Ptr - base pointer for the store +/// \p Align - alignment of \p Ptr /// \p Mask - vector of booleans which indicates what vector lanes should /// be accessed in memory CallInst *IRBuilderBase::CreateMaskedCompressStore(Value *Val, Value *Ptr, + MaybeAlign Align, Value *Mask) { Type *DataTy = Val->getType(); assert(DataTy->isVectorTy() && "Val should be a vector"); assert(Mask && "Mask should not be all-ones (null)"); Type *OverloadedTypes[] = {DataTy}; Value *Ops[] = {Val, Ptr, Mask}; - return CreateMaskedIntrinsic(Intrinsic::masked_compressstore, Ops, - OverloadedTypes); + CallInst *CI = CreateMaskedIntrinsic(Intrinsic::masked_compressstore, Ops, + OverloadedTypes); + if (Align) + CI->addParamAttr(1, Attribute::getWithAlignment(CI->getContext(), *Align)); + return CI; } template diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 94782547325ed..f698a3df08ef7 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -1320,7 +1320,7 @@ parseBoundsCheckingOptions(StringRef Params) { StringRef ParamEQ; StringRef Val; std::tie(ParamEQ, Val) = ParamName.split('='); - int8_t Id = 0; + int8_t Id; if (ParamEQ == "guard" && !Val.getAsInteger(0, Id)) { Options.GuardKind = Id; } else { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 27e9018d68a03..9fa9cccd3e3ed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1926,7 +1926,8 @@ bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug( KnownBits VKnown = CurDAG->computeKnownBits(VAddr); KnownBits SKnown = KnownBits::add(CurDAG->computeKnownBits(SAddr), - KnownBits::makeConstant(APInt(32, 
ImmOffset)));
+      KnownBits::makeConstant(APInt(32, ImmOffset,
+                                    /*isSigned=*/true)));
   uint64_t VMax = VKnown.getMaxValue().getZExtValue();
   uint64_t SMax = SKnown.getMaxValue().getZExtValue();
   return (VMax & 3) + (SMax & 3) >= 4;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 8b1b398606583..bac3bb5fde7b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1346,7 +1346,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {

     if (Src1Ty->getNumElements() > Src1NumElts) {
       Src1 = IC.Builder.CreateExtractVector(
-          FixedVectorType::get(Src0Ty->getElementType(), Src1NumElts), Src1,
+          FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
           IC.Builder.getInt64(0));
       MadeChange = true;
     }
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 024a64aceedbd..dad91c6a969e8 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -6663,9 +6663,9 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, StringRef ExtraToken,
       Mnemonic != "vshllt" && Mnemonic != "vrshrnt" && Mnemonic != "vshrnt" &&
       Mnemonic != "vqrshrunt" && Mnemonic != "vqshrunt" &&
       Mnemonic != "vqrshrnt" && Mnemonic != "vqshrnt" && Mnemonic != "vmullt" &&
-      Mnemonic != "vqmovnt" && Mnemonic != "vqmovunt" &&
-      Mnemonic != "vqmovnt" && Mnemonic != "vmovnt" && Mnemonic != "vqdmullt" &&
-      Mnemonic != "vpnot" && Mnemonic != "vcvtt" && Mnemonic != "vcvt") {
+      Mnemonic != "vqmovnt" && Mnemonic != "vqmovunt" && Mnemonic != "vmovnt" &&
+      Mnemonic != "vqdmullt" && Mnemonic != "vpnot" && Mnemonic != "vcvtt" &&
+      Mnemonic != "vcvt") {
     unsigned VCC =
         ARMVectorCondCodeFromString(Mnemonic.substr(Mnemonic.size() - 1));
     if (VCC != ~0U) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 6a95d9ebef6c7..f8dc66d598025 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -207,33 +207,39 @@ class ValueToRegClass {
 // Some Common Instruction Class Templates
 //===----------------------------------------------------------------------===//

+// Utility class to wrap up information about a register and DAG type for more
+// convenient iteration and parameterization
+class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> {
+  ValueType Ty = ty;
+  NVPTXRegClass RC = rc;
+  Operand Imm = imm;
+  int Size = ty.Size;
+}
+
+def I16RT : RegTyInfo<i16, Int16Regs, i16imm>;
+def I32RT : RegTyInfo<i32, Int32Regs, i32imm>;
+def I64RT : RegTyInfo<i64, Int64Regs, i64imm>;
+
 // Template for instructions which take three int64, int32, or int16 args.
 // The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
-multiclass I3<string OpcStr, SDNode OpNode> {
-  def i64rr :
-    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
-              !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
-              [(set i64:$dst, (OpNode i64:$a, i64:$b))]>;
-  def i64ri :
-    NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
-              !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
-              [(set i64:$dst, (OpNode i64:$a, imm:$b))]>;
-  def i32rr :
-    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
-              !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
-              [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
-  def i32ri :
-    NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
-              !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
-              [(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
-  def i16rr :
-    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
-              !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-              [(set i16:$dst, (OpNode i16:$a, i16:$b))]>;
-  def i16ri :
-    NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
-              !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-              [(set i16:$dst, (OpNode i16:$a, (imm):$b))]>;
+multiclass I3<string OpcStr, SDNode OpNode, bit commutative> {
+  foreach t = [I16RT, I32RT, I64RT] in {
+    defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;";
+
+    def t.Ty # rr :
+      NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
+                asmstr,
+                [(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>;
+    def t.Ty # ri :
+      NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
+                asmstr,
+                [(set t.Ty:$dst, (OpNode t.RC:$a, imm:$b))]>;
+    if !not(commutative) then
+      def t.Ty # ir :
+        NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
+                  asmstr,
+                  [(set t.Ty:$dst, (OpNode imm:$a, t.RC:$b))]>;
+  }
 }

 class I16x2<string OpcStr, SDNode OpNode> :
@@ -870,8 +876,8 @@ defm SUB_i1 : ADD_SUB_i1<sub>;

 // int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
 // also use these for unsigned arithmetic.
-defm ADD : I3<"add.s", add>;
-defm SUB : I3<"sub.s", sub>;
+defm ADD : I3<"add.s", add, /*commutative=*/ true>;
+defm SUB : I3<"sub.s", sub, /*commutative=*/ false>;

 def ADD16x2 : I16x2<"add.s", add>;

@@ -883,18 +889,18 @@ defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;
 defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
 defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;

-defm MULT : I3<"mul.lo.s", mul>;
+defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>;

-defm MULTHS : I3<"mul.hi.s", mulhs>;
-defm MULTHU : I3<"mul.hi.u", mulhu>;
+defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>;
+defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>;

-defm SDIV : I3<"div.s", sdiv>;
-defm UDIV : I3<"div.u", udiv>;
+defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>;
+defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>;

 // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
 // will lower it.
-defm SREM : I3<"rem.s", srem>;
-defm UREM : I3<"rem.u", urem>;
+defm SREM : I3<"rem.s", srem, /*commutative=*/ false>;
+defm UREM : I3<"rem.u", urem, /*commutative=*/ false>;

 // Integer absolute value. NumBits should be one minus the bit width of RC.
 // This idiom implements the algorithm at
@@ -909,10 +915,10 @@ defm ABS_32 : ABS;
 defm ABS_64 : ABS;

 // Integer min/max.
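// The min/max instantiations below pass the new commutative bit explicitly.
// For a commutative OpNode the immediate-on-the-left "ir" form is dead
// weight, since the DAG canonicalizes constants to the right-hand operand,
// so I3 only emits it when commutative is false. A sketch of the expansion
// (record names follow from the "def t.Ty # ..." concatenation above):
//
//   defm SMAX : I3<"max.s", smax, /*commutative=*/ true>;
//     // -> SMAXi16rr/ri, SMAXi32rr/ri, SMAXi64rr/ri (no ir forms)
//   defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>;
//     // -> SDIVi16rr/ri/ir, SDIVi32rr/ri/ir, SDIVi64rr/ri/ir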
-defm SMAX : I3<"max.s", smax>;
-defm UMAX : I3<"max.u", umax>;
-defm SMIN : I3<"min.s", smin>;
-defm UMIN : I3<"min.u", umin>;
+defm SMAX : I3<"max.s", smax, /*commutative=*/ true>;
+defm UMAX : I3<"max.u", umax, /*commutative=*/ true>;
+defm SMIN : I3<"min.s", smin, /*commutative=*/ true>;
+defm UMIN : I3<"min.u", umin, /*commutative=*/ true>;

 def SMAX16x2 : I16x2<"max.s", smax>;
 def UMAX16x2 : I16x2<"max.u", umax>;
@@ -1392,25 +1398,32 @@ def FDIV32ri_prec :
 //
 multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
-  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
-                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
-                      Requires<[Pred]>;
-  def rri : NVPTXInst<(outs RC:$dst),
-                      (ins RC:$a, RC:$b, ImmCls:$c),
-                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
-                      Requires<[Pred]>;
-  def rir : NVPTXInst<(outs RC:$dst),
-                      (ins RC:$a, ImmCls:$b, RC:$c),
-                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
-                      Requires<[Pred]>;
-  def rii : NVPTXInst<(outs RC:$dst),
-                      (ins RC:$a, ImmCls:$b, ImmCls:$c),
-                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
-                      [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
-                      Requires<[Pred]>;
+  defvar asmstr = OpcStr # " \t$dst, $a, $b, $c;";
+  def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+                      asmstr,
+                      [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+                      Requires<[Pred]>;
+  def rri : NVPTXInst<(outs RC:$dst),
+                      (ins RC:$a, RC:$b, ImmCls:$c),
+                      asmstr,
+                      [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
+                      Requires<[Pred]>;
+  def rir : NVPTXInst<(outs RC:$dst),
+                      (ins RC:$a, ImmCls:$b, RC:$c),
+                      asmstr,
+                      [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
+                      Requires<[Pred]>;
+  def rii : NVPTXInst<(outs RC:$dst),
+                      (ins RC:$a, ImmCls:$b, ImmCls:$c),
+                      asmstr,
+                      [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
+                      Requires<[Pred]>;
+  def iir : NVPTXInst<(outs RC:$dst),
+                      (ins ImmCls:$a, ImmCls:$b, RC:$c),
+                      asmstr,
+                      [(set RC:$dst, (fma fpimm:$a, fpimm:$b, RC:$c))]>,
+                      Requires<[Pred]>;
+
 }

 multiclass FMA_F16 {
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 4f144cc641080..2d6ee2e28b4df 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -6,19 +6,6 @@
 //
 //===----------------------------------------------------------------------===//

-// Utility class to wrap up information about a register and DAG type for more
-// convenient iteration and parameterization
-class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm> {
-  ValueType Ty = ty;
-  NVPTXRegClass RC = rc;
-  Operand Imm = imm;
-  int Size = ty.Size;
-}
-
-def I32RT : RegTyInfo<i32, Int32Regs, i32imm>;
-def I64RT : RegTyInfo<i64, Int64Regs, i64imm>;
-
-
 def immFloat0 : PatLeaf<(fpimm), [{
   float f = (float)N->getValueAPF().convertToFloat();
   return (f==0.0f);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d4152ff4a816c..90e3e15b1fb46 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15672,12 +15672,16 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
         (!isSingleSHUFPSMask(HalfMask) ||
          Subtarget.hasFastVariableCrossLaneShuffle()))
       return SDValue();
-    // If this is a unary shuffle (assume that the 2nd operand is
+    // If this is an unary shuffle (assume that the 2nd operand is
     // canonicalized to undef), then we can use vpermpd. Otherwise, we
     // are better off extracting the upper half of 1 operand and using a
     // narrow shuffle.
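    // Context for the vXi8 case added below: HalfIdx1 == 0 && HalfIdx2 == 1
    // is the "inplace halves" pattern, where each half of the result draws
    // only from the matching half of the source. PSHUFB never moves bytes
    // across a 128-bit lane, so such shuffles can be done as one full-width
    // PSHUFB plus a merge of the halves, which is cheaper than extracting
    // the upper half and shuffling it separately.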
     if (EltWidth == 64 && V2.isUndef())
       return SDValue();
+    // If this is an unary vXi8 shuffle with inplace halves, then perform as
+    // full width pshufb, and then merge.
+    if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
+      return SDValue();
   }
   // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
   if (Subtarget.hasAVX512() && VT.is512BitVector())
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 4c1de09e91f21..ed58e72089839 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1128,7 +1128,7 @@ static StringRef getDXILArchNameFromShaderModel(StringRef ShaderModelStr) {
   return Triple::getArchName(Triple::dxil, Triple::DXILSubArch_v1_0);
 }

-std::string Triple::normalize(StringRef Str) {
+std::string Triple::normalize(StringRef Str, CanonicalForm Form) {
   bool IsMinGW32 = false;
   bool IsCygwin = false;

@@ -1334,6 +1334,19 @@ std::string Triple::normalize(StringRef Str) {
       Components[0] = getDXILArchNameFromShaderModel(Components[2]);
     }
   }
+
+  // Canonicalize the components if necessary.
+  switch (Form) {
+  case CanonicalForm::ANY:
+    break;
+  case CanonicalForm::THREE_IDENT:
+  case CanonicalForm::FOUR_IDENT:
+  case CanonicalForm::FIVE_IDENT: {
+    Components.resize(static_cast<size_t>(Form), "unknown");
+    break;
+  }
+  }
+
   // Stick the corrected components back together to form the normalized string.
   return join(Components, "-");
 }
@@ -2024,6 +2037,10 @@ bool Triple::isLittleEndian() const {
 }

 bool Triple::isCompatibleWith(const Triple &Other) const {
+  // On MinGW, C code is usually built with a "w64" vendor, while Rust
+  // often uses a "pc" vendor.
+  bool IgnoreVendor = isWindowsGNUEnvironment();
+
   // ARM and Thumb triples are compatible, if subarch, vendor and OS match.
   if ((getArch() == Triple::thumb && Other.getArch() == Triple::arm) ||
       (getArch() == Triple::arm && Other.getArch() == Triple::thumb) ||
@@ -2034,17 +2051,24 @@ bool Triple::isCompatibleWith(const Triple &Other) const {
              getVendor() == Other.getVendor() && getOS() == Other.getOS();
     else
       return getSubArch() == Other.getSubArch() &&
-             getVendor() == Other.getVendor() && getOS() == Other.getOS() &&
+             (getVendor() == Other.getVendor() || IgnoreVendor) &&
+             getOS() == Other.getOS() &&
              getEnvironment() == Other.getEnvironment() &&
              getObjectFormat() == Other.getObjectFormat();
   }

-  // If vendor is apple, ignore the version number.
+  // If vendor is apple, ignore the version number (the environment field)
+  // and the object format.
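The IgnoreVendor relaxation introduced above targets mixed MinGW/Rust
toolchains; a small sketch of the intended effect (plain llvm::Triple usage,
with the vendors named in the comment above):

    Triple A("x86_64-pc-windows-gnu");  // "pc" vendor, common for Rust
    Triple B("x86_64-w64-windows-gnu"); // "w64" vendor, common for MinGW C
    // A.isCompatibleWith(B) now holds: arch, OS, environment and object
    // format must still match, only the vendor difference is ignored.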
   if (getVendor() == Triple::Apple)
     return getArch() == Other.getArch() && getSubArch() == Other.getSubArch() &&
-           getVendor() == Other.getVendor() && getOS() == Other.getOS();
-
-  return *this == Other;
+           (getVendor() == Other.getVendor() || IgnoreVendor) &&
+           getOS() == Other.getOS();
+
+  return getArch() == Other.getArch() && getSubArch() == Other.getSubArch() &&
+         (getVendor() == Other.getVendor() || IgnoreVendor) &&
+         getOS() == Other.getOS() &&
+         getEnvironment() == Other.getEnvironment() &&
+         getObjectFormat() == Other.getObjectFormat();
 }

 std::string Triple::merge(const Triple &Other) const {
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 429e323b6b7c2..0169320deae46 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3542,6 +3542,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   void handleMaskedExpandLoad(IntrinsicInst &I) {
     IRBuilder<> IRB(&I);
     Value *Ptr = I.getArgOperand(0);
+    MaybeAlign Align = I.getParamAlign(0);
     Value *Mask = I.getArgOperand(1);
     Value *PassThru = I.getArgOperand(2);

@@ -3559,10 +3560,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     Type *ShadowTy = getShadowTy(&I);
     Type *ElementShadowTy = cast<VectorType>(ShadowTy)->getElementType();
     auto [ShadowPtr, OriginPtr] =
-        getShadowOriginPtr(Ptr, IRB, ElementShadowTy, {}, /*isStore*/ false);
+        getShadowOriginPtr(Ptr, IRB, ElementShadowTy, Align, /*isStore*/ false);

-    Value *Shadow = IRB.CreateMaskedExpandLoad(
-        ShadowTy, ShadowPtr, Mask, getShadow(PassThru), "_msmaskedexpload");
+    Value *Shadow =
+        IRB.CreateMaskedExpandLoad(ShadowTy, ShadowPtr, Align, Mask,
+                                   getShadow(PassThru), "_msmaskedexpload");

     setShadow(&I, Shadow);

@@ -3574,6 +3576,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     IRBuilder<> IRB(&I);
     Value *Values = I.getArgOperand(0);
     Value *Ptr = I.getArgOperand(1);
+    MaybeAlign Align = I.getParamAlign(1);
     Value *Mask = I.getArgOperand(2);

     if (ClCheckAccessAddress) {
@@ -3585,9 +3588,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     Type *ElementShadowTy =
         getShadowTy(cast<VectorType>(Values->getType())->getElementType());
     auto [ShadowPtr, OriginPtrs] =
-        getShadowOriginPtr(Ptr, IRB, ElementShadowTy, {}, /*isStore*/ true);
+        getShadowOriginPtr(Ptr, IRB, ElementShadowTy, Align, /*isStore*/ true);

-    IRB.CreateMaskedCompressStore(Shadow, ShadowPtr, Mask);
+    IRB.CreateMaskedCompressStore(Shadow, ShadowPtr, Align, Mask);

     // TODO: Store origins.
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 744faef192438..99f6a8860f0f4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4052,7 +4052,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return FixedScalableVFPair::getNone();
   }

-  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+  ScalarEvolution *SE = PSE.getSE();
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
   unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
   if (TC != MaxTC)
@@ -4064,6 +4065,22 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return FixedScalableVFPair::getNone();
   }

+  // If BTC matches the widest induction type and is -1 then the trip count
+  // computation will wrap to 0 and the vector trip count will be 0. Do not try
+  // to vectorize.
+  const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
+  if (!isa<SCEVCouldNotCompute>(BTC) &&
+      BTC->getType()->getScalarSizeInBits() >=
+          Legal->getWidestInductionType()->getScalarSizeInBits() &&
+      SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
+                           SE->getMinusOne(BTC->getType()))) {
+    reportVectorizationFailure(
+        "Trip count computation wrapped",
+        "backedge-taken count is -1, loop trip count wrapped to 0",
+        "TripCountWrapped", ORE, TheLoop);
+    return FixedScalableVFPair::getNone();
+  }
+
   switch (ScalarEpilogueStatus) {
   case CM_ScalarEpilogueAllowed:
     return computeFeasibleMaxVF(MaxTC, UserVF, false);
@@ -9065,8 +9082,9 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
   auto *ScalarPH = Plan.getScalarPreheader();
   auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
   VPBuilder VectorPHBuilder(
-      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()));
+      cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
   VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
   VPBuilder ScalarPHBuilder(ScalarPH);
   VPValue *OneVPV = Plan.getOrAddLiveIn(
@@ -9098,6 +9116,8 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
     // start value provides the value if the loop is bypassed.
     bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
     auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
+    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+           "Cannot handle loops with uncountable early exits");
     if (IsFOR)
       ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
           VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
@@ -9267,6 +9287,9 @@ static void addExitUsersForFirstOrderRecurrences(
     if (!FOR)
       continue;

+    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
+           "Cannot handle loops with uncountable early exits");
+
     // This is the second phase of vectorizing first-order recurrences, creating
     // extract for users outside the loop. An overview of the transformation is
     // described below. Suppose we have the following loop with some use after
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1da185f9cfdf4..87f87bf143719 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1813,11 +1813,10 @@ class VPHistogramRecipe : public VPRecipeBase {
 };

 /// A recipe for widening select instructions.
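The rework below lets the recipe record the IR flags of its underlying
SelectInst, most relevantly fast-math flags, and the execute()/print()
changes further down reapply and display them. In IR terms, flags such as
those in

    %r = select nnan ninf i1 %c, float %a, float %b

were previously not carried over to the widened vector select; with this
change they survive, so later FP folds can still use them.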
-struct VPWidenSelectRecipe : public VPSingleDefRecipe {
+struct VPWidenSelectRecipe : public VPRecipeWithIRFlags {
   template <typename IterT>
   VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands)
-      : VPSingleDefRecipe(VPDef::VPWidenSelectSC, Operands, &I,
-                          I.getDebugLoc()) {}
+      : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I) {}

   ~VPWidenSelectRecipe() override = default;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4057a51155ece..979a8e0768a99 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1242,6 +1242,7 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
   O << Indent << "WIDEN-SELECT ";
   printAsOperand(O, SlotTracker);
   O << " = select ";
+  printFlags(O);
   getOperand(0)->printAsOperand(O, SlotTracker);
   O << ", ";
   getOperand(1)->printAsOperand(O, SlotTracker);
@@ -1266,6 +1267,8 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) {
   Value *Op1 = State.get(getOperand(2));
   Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
   State.set(this, Sel);
+  if (isa<FPMathOperator>(Sel))
+    setFlags(cast<Instruction>(Sel));
   State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
 }

diff --git a/llvm/test/Analysis/BasicAA/memset-pattern.ll b/llvm/test/Analysis/BasicAA/memset-pattern.ll
new file mode 100644
index 0000000000000..aaa605db0eb26
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/memset-pattern.ll
@@ -0,0 +1,18 @@
+; RUN: opt -mtriple=x86_64 -aa-pipeline=basic-aa -passes=inferattrs,aa-eval -print-all-alias-modref-info -disable-output 2>&1 %s | FileCheck %s
+
+define void @test_memset_pattern4_const_size(ptr noalias %a, i32 %pattern) {
+; CHECK-LABEL: Function: test_memset_pattern4_const_size
+; CHECK: Just Mod:  Ptr: i8* %a	<->  call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr %a, i32 %pattern, i64 17, i1 false)
+; CHECK-NEXT: Just Mod:  Ptr: i8* %a.gep.1	<->  call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr %a, i32 %pattern, i64 17, i1 false)
+; CHECK-NEXT: NoModRef:  Ptr: i8* %a.gep.129	<->  call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr %a, i32 %pattern, i64 17, i1 false)
+
+entry:
+  load i8, ptr %a
+  call void @llvm.experimental.memset.pattern(ptr %a, i32 %pattern, i64 17, i1 0)
+  %a.gep.1 = getelementptr i8, ptr %a, i32 1
+  store i8 0, ptr %a.gep.1
+  %a.gep.129 = getelementptr i8, ptr %a, i32 129
+  store i8 1, ptr %a.gep.129
+
+  ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll b/llvm/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
index 22404b102a73f..26f60c00ae1bf 100644
--- a/llvm/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
+++ b/llvm/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
@@ -1,32 +1,39 @@
-; RUN: opt < %s "-passes=print<scalar-evolution>" -disable-output 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes="print<scalar-evolution>" -disable-output \
+; RUN:   -scalar-evolution-classify-expressions=0  2>&1 | FileCheck %s
 ; PR1597

-; CHECK: Loop %bb: backedge-taken count is (-1 + (-1 * %x) + %y)
-
 define i32 @f(i32 %x, i32 %y) {
+; CHECK-LABEL: 'f'
+; CHECK-NEXT:  Determining loop execution counts for: @f
+; CHECK-NEXT:  Loop %bb: backedge-taken count is (-1 + (-1 * %x) + %y)
+; CHECK-NEXT:  Loop %bb: constant max backedge-taken count is i32 -1
+; CHECK-NEXT:  Loop %bb: symbolic max backedge-taken count is (-1 + (-1 * %x) + %y)
+; CHECK-NEXT:  Loop %bb: Trip multiple is 1
+;
 entry:
-        %tmp63 = icmp ult i32 %x, %y            ; <i1> [#uses=1]
-        br i1 %tmp63, label %bb.preheader, label %bb8
+  %tmp63 = icmp ult i32 %x, %y ; <i1> [#uses=1]
+  br i1 %tmp63, label %bb.preheader, label %bb8

 bb.preheader:           ; preds = %entry
-        br label %bb
+  br label %bb

 bb:             ; preds = %bb3, %bb.preheader
-        %x_addr.0 = phi i32 [ %tmp2, %bb3 ], [ %x, %bb.preheader ]             ; <i32> [#uses=1]
-        %tmp2 = add i32 %x_addr.0, 1            ; <i32> [#uses=3]
-        br label %bb3
+  %x_addr.0 = phi i32 [ %tmp2, %bb3 ], [ %x, %bb.preheader ] ; <i32> [#uses=1]
+  %tmp2 = add i32 %x_addr.0, 1 ; <i32> [#uses=3]
+  br label %bb3

 bb3:            ; preds = %bb
-        %tmp6 = icmp ult i32 %tmp2, %y          ; <i1> [#uses=1]
-        br i1 %tmp6, label %bb, label %bb8.loopexit
+  %tmp6 = icmp ult i32 %tmp2, %y ; <i1> [#uses=1]
+  br i1 %tmp6, label %bb, label %bb8.loopexit

 bb8.loopexit:           ; preds = %bb3
-        br label %bb8
+  br label %bb8

 bb8:            ; preds = %bb8.loopexit, %entry
-        %x_addr.1 = phi i32 [ %x, %entry ], [ %tmp2, %bb8.loopexit ]           ; <i32> [#uses=1]
-        br label %return
+  %x_addr.1 = phi i32 [ %x, %entry ], [ %tmp2, %bb8.loopexit ] ; <i32> [#uses=1]
+  br label %return

 return:         ; preds = %bb8
-        ret i32 %x_addr.1
+  ret i32 %x_addr.1
 }
diff --git a/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll
deleted file mode 100644
index 4a0ebf810568e..0000000000000
--- a/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: opt -passes=indvars -S < %s | FileCheck %s
-
-declare void @use(i1)
-
-declare void @llvm.experimental.guard(i1, ...)
-
-define void @test_01(i8 %t) {
-; CHECK-LABEL: test_01
- entry:
-  %st = sext i8 %t to i16
-  %cmp1 = icmp slt i16 %st, 42
-  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
-  br label %loop
-
- loop:
-; CHECK-LABEL: loop
-  %idx = phi i8 [ %t, %entry ], [ %idx.inc, %loop ]
-  %idx.inc = add i8 %idx, 1
-  %c = icmp slt i8 %idx, 42
-; CHECK: call void @use(i1 true)
-  call void @use(i1 %c)
-  %be = icmp slt i8 %idx.inc, 42
-  br i1 %be, label %loop, label %exit
-
- exit:
-  ret void
-}
-
-define void @test_02(i8 %t) {
-; CHECK-LABEL: test_02
- entry:
-  %t.ptr = inttoptr i8 %t to ptr
-  %p.42 = inttoptr i8 42 to ptr
-  %cmp1 = icmp slt ptr %t.ptr, %p.42
-  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
-  br label %loop
-
- loop:
-; CHECK-LABEL: loop
-  %idx = phi ptr [ %t.ptr, %entry ], [ %snext, %loop ]
-  %snext = getelementptr inbounds i8, ptr %idx, i64 1
-  %c = icmp slt ptr %idx, %p.42
-; CHECK: call void @use(i1 true)
-  call void @use(i1 %c)
-  %be = icmp slt ptr %snext, %p.42
-  br i1 %be, label %loop, label %exit
-
- exit:
-  ret void
-}
diff --git a/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll b/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll
index 6d38f510c4997..fbe69b4b18897 100644
--- a/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll
+++ b/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll
@@ -1,11 +1,18 @@
-; RUN: opt < %s -disable-output "-passes=print<scalar-evolution>" 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -disable-output -passes="print<scalar-evolution>" \
+; RUN:   -scalar-evolution-classify-expressions=0  2>&1 | FileCheck %s

 declare void @llvm.experimental.guard(i1, ...)

 define void @test_1(i32 %n) nounwind {
 ; Prove that (n > 1) ===> (n / 2 > 0).
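; The updated counts can be sanity-checked by hand: the guard gives n >= 2,
; and sdiv rounds toward zero, so %n.div.2 = floor(n/2) >= 1 and the backedge
; is taken %n.div.2 - 1 times, i.e. (-1 + %n.div.2). The constant max follows
; from n <= 2^31 - 1: %n.div.2 <= 1073741823, so the count is at most
; 1073741822.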
-; CHECK: Determining loop execution counts for: @test_1 -; CHECK: Loop %header: backedge-taken count is (-1 + %n.div.2) +; CHECK-LABEL: 'test_1' +; CHECK-NEXT: Determining loop execution counts for: @test_1 +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + %n.div.2) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + %n.div.2) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, 1 %n.div.2 = sdiv i32 %n, 2 @@ -24,8 +31,13 @@ exit: define void @test_1neg(i32 %n) nounwind { ; Prove that (n > 0) =\=> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_1neg -; CHECK: Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-LABEL: 'test_1neg' +; CHECK-NEXT: Determining loop execution counts for: @test_1neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, 0 %n.div.2 = sdiv i32 %n, 2 @@ -44,8 +56,13 @@ exit: define void @test_2(i32 %n) nounwind { ; Prove that (n >= 2) ===> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_2 -; CHECK: Loop %header: backedge-taken count is (-1 + %n.div.2) +; CHECK-LABEL: 'test_2' +; CHECK-NEXT: Determining loop execution counts for: @test_2 +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + %n.div.2) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + %n.div.2) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, 2 %n.div.2 = sdiv i32 %n, 2 @@ -64,8 +81,13 @@ exit: define void @test_2neg(i32 %n) nounwind { ; Prove that (n >= 1) =\=> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_2neg -; CHECK: Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-LABEL: 'test_2neg' +; CHECK-NEXT: Determining loop execution counts for: @test_2neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n.div.2)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, 1 %n.div.2 = sdiv i32 %n, 2 @@ -84,8 +106,13 @@ exit: define void @test_3(i32 %n) nounwind { ; Prove that (n > -2) ===> (n / 2 >= 0). -; CHECK: Determining loop execution counts for: @test_3 -; CHECK: Loop %header: backedge-taken count is (1 + %n.div.2) +; CHECK-LABEL: 'test_3' +; CHECK-NEXT: Determining loop execution counts for: @test_3 +; CHECK-NEXT: Loop %header: backedge-taken count is (1 + %n.div.2) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + %n.div.2) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, -2 %n.div.2 = sdiv i32 %n, 2 @@ -104,8 +131,13 @@ exit: define void @test_3neg(i32 %n) nounwind { ; Prove that (n > -3) =\=> (n / 2 >= 0). 
-; CHECK: Determining loop execution counts for: @test_3neg -; CHECK: Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-LABEL: 'test_3neg' +; CHECK-NEXT: Determining loop execution counts for: @test_3neg +; CHECK-NEXT: Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, -3 %n.div.2 = sdiv i32 %n, 2 @@ -124,8 +156,13 @@ exit: define void @test_4(i32 %n) nounwind { ; Prove that (n >= -1) ===> (n / 2 >= 0). -; CHECK: Determining loop execution counts for: @test_4 -; CHECK: Loop %header: backedge-taken count is (1 + %n.div.2) +; CHECK-LABEL: 'test_4' +; CHECK-NEXT: Determining loop execution counts for: @test_4 +; CHECK-NEXT: Loop %header: backedge-taken count is (1 + %n.div.2) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + %n.div.2) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, -1 %n.div.2 = sdiv i32 %n, 2 @@ -144,8 +181,13 @@ exit: define void @test_4neg(i32 %n) nounwind { ; Prove that (n >= -2) =\=> (n / 2 >= 0). -; CHECK: Determining loop execution counts for: @test_4neg -; CHECK: Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-LABEL: 'test_4neg' +; CHECK-NEXT: Determining loop execution counts for: @test_4neg +; CHECK-NEXT: Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (0 smax (1 + %n.div.2)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, -2 %n.div.2 = sdiv i32 %n, 2 @@ -164,8 +206,13 @@ exit: define void @test_ext_01(i32 %n) nounwind { ; Prove that (n > 1) ===> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_ext_01 -; CHECK: Loop %header: backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-LABEL: 'test_ext_01' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_01 +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, 1 %n.div.2 = sdiv i32 %n, 2 @@ -185,8 +232,13 @@ exit: define void @test_ext_01neg(i32 %n) nounwind { ; Prove that (n > 0) =\=> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_ext_01neg -; CHECK: Loop %header: backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-LABEL: 'test_ext_01neg' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_01neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, 0 %n.div.2 = sdiv i32 %n, 2 @@ -206,8 +258,13 @@ exit: define void @test_ext_02(i32 %n) nounwind { ; Prove that (n >= 2) ===> (n / 2 > 0). 
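; Here and in the remaining test_ext_* checks the counts are i64 expressions
; built from (sext i32 %n.div.2 to i64), but the constant max keeps the
; i32-derived bound: %n.div.2 <= 2^30 - 1 = 1073741823 before the extension,
; so the (-1 + ...) counts top out at i64 1073741822 and the (1 + ...) counts
; at i64 1073741824.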
-; CHECK: Determining loop execution counts for: @test_ext_02 -; CHECK: Loop %header: backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-LABEL: 'test_ext_02' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_02 +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, 2 %n.div.2 = sdiv i32 %n, 2 @@ -227,8 +284,13 @@ exit: define void @test_ext_02neg(i32 %n) nounwind { ; Prove that (n >= 1) =\=> (n / 2 > 0). -; CHECK: Determining loop execution counts for: @test_ext_02neg -; CHECK: Loop %header: backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-LABEL: 'test_ext_02neg' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_02neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741822 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sge i32 %n, 1 %n.div.2 = sdiv i32 %n, 2 @@ -248,8 +310,13 @@ exit: define void @test_ext_03(i32 %n) nounwind { ; Prove that (n > -2) ===> (n / 2 >= 0). -; CHECK: Determining loop execution counts for: @test_ext_03 -; CHECK: Loop %header: backedge-taken count is (1 + (sext i32 %n.div.2 to i64)) +; CHECK-LABEL: 'test_ext_03' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_03 +; CHECK-NEXT: Loop %header: backedge-taken count is (1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + (sext i32 %n.div.2 to i64)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, -2 %n.div.2 = sdiv i32 %n, 2 @@ -269,8 +336,13 @@ exit: define void @test_ext_03neg(i32 %n) nounwind { ; Prove that (n > -3) =\=> (n / 2 >= 0). -; CHECK: Determining loop execution counts for: @test_ext_03neg -; CHECK: Loop %header: backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64))) +; CHECK-LABEL: 'test_ext_03neg' +; CHECK-NEXT: Determining loop execution counts for: @test_ext_03neg +; CHECK-NEXT: Loop %header: backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i64 1073741824 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64))) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; entry: %cmp1 = icmp sgt i32 %n, -3 %n.div.2 = sdiv i32 %n, 2 @@ -290,8 +362,13 @@ exit: define void @test_ext_04(i32 %n) nounwind { ; Prove that (n >= -1) ===> (n / 2 >= 0). 
-; CHECK: Determining loop execution counts for: @test_ext_04
-; CHECK: Loop %header: backedge-taken count is (1 + (sext i32 %n.div.2 to i64))
+; CHECK-LABEL: 'test_ext_04'
+; CHECK-NEXT:  Determining loop execution counts for: @test_ext_04
+; CHECK-NEXT:  Loop %header: backedge-taken count is (1 + (sext i32 %n.div.2 to i64))
+; CHECK-NEXT:  Loop %header: constant max backedge-taken count is i64 1073741824
+; CHECK-NEXT:  Loop %header: symbolic max backedge-taken count is (1 + (sext i32 %n.div.2 to i64))
+; CHECK-NEXT:  Loop %header: Trip multiple is 1
+;
 entry:
   %cmp1 = icmp sge i32 %n, -1
   %n.div.2 = sdiv i32 %n, 2
@@ -311,8 +388,13 @@ exit:

 define void @test_ext_04neg(i32 %n) nounwind {
 ; Prove that (n >= -2) =\=> (n / 2 >= 0).
-; CHECK: Determining loop execution counts for: @test_ext_04neg
-; CHECK: Loop %header: backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64)))
+; CHECK-LABEL: 'test_ext_04neg'
+; CHECK-NEXT:  Determining loop execution counts for: @test_ext_04neg
+; CHECK-NEXT:  Loop %header: backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64)))
+; CHECK-NEXT:  Loop %header: constant max backedge-taken count is i64 1073741824
+; CHECK-NEXT:  Loop %header: symbolic max backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64)))
+; CHECK-NEXT:  Loop %header: Trip multiple is 1
+;
 entry:
   %cmp1 = icmp sge i32 %n, -2
   %n.div.2 = sdiv i32 %n, 2
diff --git a/llvm/test/Analysis/ScalarEvolution/infer-prestart-no-wrap.ll b/llvm/test/Analysis/ScalarEvolution/infer-prestart-no-wrap.ll
index a8b891b5afb23..677463ee63225 100644
--- a/llvm/test/Analysis/ScalarEvolution/infer-prestart-no-wrap.ll
+++ b/llvm/test/Analysis/ScalarEvolution/infer-prestart-no-wrap.ll
@@ -1,7 +1,29 @@
-; ; RUN: opt -disable-output "-passes=print<scalar-evolution>" < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -disable-output -passes="print<scalar-evolution>" < %s 2>&1 | FileCheck %s

 define void @infer.sext.0(ptr %c, i32 %start, ptr %buf) {
-; CHECK-LABEL: Classifying expressions for: @infer.sext.0
+; CHECK-LABEL: 'infer.sext.0'
+; CHECK-NEXT:  Classifying expressions for: @infer.sext.0
+; CHECK-NEXT:    %counter = phi i32 [ 0, %entry ], [ %counter.inc, %loop ]
+; CHECK-NEXT:    --> {0,+,1}<%loop> U: [0,2) S: [0,2) Exits: 1 LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx = phi i32 [ %start, %entry ], [ %idx.inc, %loop ]
+; CHECK-NEXT:    --> {%start,+,1}<%loop> U: full-set S: full-set Exits: (1 + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx.inc = add nsw i32 %idx, 1
+; CHECK-NEXT:    --> {(1 + %start),+,1}<%loop> U: full-set S: full-set Exits: (2 + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx.inc.sext = sext i32 %idx.inc to i64
+; CHECK-NEXT:    --> {(1 + (sext i32 %start to i64)),+,1}<%loop> U: [-2147483647,2147483650) S: [-2147483647,2147483650) Exits: (2 + (sext i32 %start to i64)) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %buf.gep = getelementptr inbounds i32, ptr %buf, i32 %idx.inc
+; CHECK-NEXT:    --> {(4 + (4 * (sext i32 %start to i64)) + %buf),+,4}<%loop> U: full-set S: full-set Exits: (8 + (4 * (sext i32 %start to i64)) + %buf) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %val = load i32, ptr %buf.gep, align 4
+; CHECK-NEXT:    --> %val U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT:    %counter.inc = add i32 %counter, 1
+; CHECK-NEXT:    --> {1,+,1}<%loop> U: [1,3) S: [1,3) Exits: 2 LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @infer.sext.0
+; CHECK-NEXT:  Loop %loop: backedge-taken count is i32 1
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i32 1
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is i32 1
+; CHECK-NEXT:  Loop %loop: Trip multiple is 2
+;
 entry:
   br label %loop

@@ -10,8 +32,6 @@ define void @infer.sext.0(ptr %c, i32 %start, ptr %buf) {
   %idx = phi i32 [ %start, %entry ], [ %idx.inc, %loop ]
   %idx.inc = add nsw i32 %idx, 1
   %idx.inc.sext = sext i32 %idx.inc to i64
-; CHECK: %idx.inc.sext = sext i32 %idx.inc to i64
-; CHECK-NEXT: --> {(1 + (sext i32 %start to i64)),+,1}<%loop>
   %buf.gep = getelementptr inbounds i32, ptr %buf, i32 %idx.inc
   %val = load i32, ptr %buf.gep
@@ -25,7 +45,28 @@ define void @infer.sext.0(ptr %c, i32 %start, ptr %buf) {
 }

 define void @infer.zext.0(ptr %c, i32 %start, ptr %buf) {
-; CHECK-LABEL: Classifying expressions for: @infer.zext.0
+; CHECK-LABEL: 'infer.zext.0'
+; CHECK-NEXT:  Classifying expressions for: @infer.zext.0
+; CHECK-NEXT:    %counter = phi i32 [ 0, %entry ], [ %counter.inc, %loop ]
+; CHECK-NEXT:    --> {0,+,1}<%loop> U: [0,2) S: [0,2) Exits: 1 LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx = phi i32 [ %start, %entry ], [ %idx.inc, %loop ]
+; CHECK-NEXT:    --> {%start,+,1}<%loop> U: full-set S: full-set Exits: (1 + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx.inc = add nuw i32 %idx, 1
+; CHECK-NEXT:    --> {(1 + %start),+,1}<%loop> U: [1,0) S: [1,0) Exits: (2 + %start) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx.inc.sext = zext i32 %idx.inc to i64
+; CHECK-NEXT:    --> {(1 + (zext i32 %start to i64)),+,1}<%loop> U: [1,4294967298) S: [1,4294967298) Exits: (2 + (zext i32 %start to i64)) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %buf.gep = getelementptr inbounds i32, ptr %buf, i32 %idx.inc
+; CHECK-NEXT:    --> ((4 * (sext i32 {(1 + %start),+,1}<%loop> to i64)) + %buf) U: full-set S: full-set Exits: ((4 * (sext i32 (2 + %start) to i64)) + %buf) LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %val = load i32, ptr %buf.gep, align 4
+; CHECK-NEXT:    --> %val U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT:    %counter.inc = add i32 %counter, 1
+; CHECK-NEXT:    --> {1,+,1}<%loop> U: [1,3) S: [1,3) Exits: 2 LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @infer.zext.0
+; CHECK-NEXT:  Loop %loop: backedge-taken count is i32 1
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i32 1
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is i32 1
+; CHECK-NEXT:  Loop %loop: Trip multiple is 2
+;
 entry:
   br label %loop

@@ -34,8 +75,6 @@ define void @infer.zext.0(ptr %c, i32 %start, ptr %buf) {
   %idx = phi i32 [ %start, %entry ], [ %idx.inc, %loop ]
   %idx.inc = add nuw i32 %idx, 1
   %idx.inc.sext = zext i32 %idx.inc to i64
-; CHECK: %idx.inc.sext = zext i32 %idx.inc to i64
-; CHECK-NEXT: --> {(1 + (zext i32 %start to i64)),+,1}<%loop>
   %buf.gep = getelementptr inbounds i32, ptr %buf, i32 %idx.inc
   %val = load i32, ptr %buf.gep
@@ -49,7 +88,25 @@ define void @infer.zext.0(ptr %c, i32 %start, ptr %buf) {
 }

 define void @infer.sext.1(i32 %start, ptr %c) {
-; CHECK-LABEL: Classifying expressions for: @infer.sext.1
+; CHECK-LABEL: 'infer.sext.1'
+; CHECK-NEXT:  Classifying expressions for: @infer.sext.1
+; CHECK-NEXT:    %start.mul = mul i32 %start, 4
+; CHECK-NEXT:    --> (4 * %start) U: [0,-3) S: [-2147483648,2147483645)
+; CHECK-NEXT:    %start.real = add i32 %start.mul, 2
+; CHECK-NEXT:    --> (2 + (4 * %start)) U: [2,-1) S: [-2147483646,2147483647)
+; CHECK-NEXT:    %idx = phi i32 [ %start.real, %entry ], [ %idx.inc, %loop ]
+; CHECK-NEXT:    --> {(2 + (4 * %start)),+,2}<%loop> U: [0,-1) S: [-2147483646,2147483647) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx.sext = sext i32 %idx to i64
+; CHECK-NEXT:    --> {(2 + (sext i32 (4 * %start) to i64)),+,2}<%loop> U: [0,-1) S: [-2147483646,9223372036854775807) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx.inc = add nsw i32 %idx, 2
+; CHECK-NEXT:    --> {(4 + (4 * %start)),+,2}<%loop> U: [0,-1) S: [-2147483648,2147483647) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %condition = load i1, ptr %c, align 1
+; CHECK-NEXT:    --> %condition U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT:  Determining loop execution counts for: @infer.sext.1
+; CHECK-NEXT:  Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
 entry:
   %start.mul = mul i32 %start, 4
   %start.real = add i32 %start.mul, 2
@@ -58,8 +115,6 @@ define void @infer.sext.1(i32 %start, ptr %c) {
 loop:
   %idx = phi i32 [ %start.real, %entry ], [ %idx.inc, %loop ]
   %idx.sext = sext i32 %idx to i64
-; CHECK: %idx.sext = sext i32 %idx to i64
-; CHECK-NEXT: --> {(2 + (sext i32 (4 * %start) to i64)),+,2}<%loop>
   %idx.inc = add nsw i32 %idx, 2
   %condition = load i1, ptr %c
   br i1 %condition, label %exit, label %loop
@@ -69,7 +124,23 @@ define void @infer.sext.1(i32 %start, ptr %c) {
 }

 define void @infer.sext.2(ptr %c, i8 %start) {
-; CHECK-LABEL: Classifying expressions for: @infer.sext.2
+; CHECK-LABEL: 'infer.sext.2'
+; CHECK-NEXT:  Classifying expressions for: @infer.sext.2
+; CHECK-NEXT:    %start.inc = add i8 %start, 1
+; CHECK-NEXT:    --> (1 + %start) U: full-set S: full-set
+; CHECK-NEXT:    %idx = phi i8 [ %start.inc, %entry ], [ %idx.inc, %loop ]
+; CHECK-NEXT:    --> {(1 + %start),+,1}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx.sext = sext i8 %idx to i16
+; CHECK-NEXT:    --> {(1 + (sext i8 %start to i16)),+,1}<%loop> U: [-127,-32768) S: [-127,-32768) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx.inc = add nsw i8 %idx, 1
+; CHECK-NEXT:    --> {(2 + %start),+,1}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %condition = load volatile i1, ptr %c, align 1
+; CHECK-NEXT:    --> %condition U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT:  Determining loop execution counts for: @infer.sext.2
+; CHECK-NEXT:  Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
 entry:
   %start.inc = add i8 %start, 1
   %entry.condition = icmp slt i8 %start, 127
@@ -78,8 +149,6 @@ define void @infer.sext.2(ptr %c, i8 %start) {
 loop:
   %idx = phi i8 [ %start.inc, %entry ], [ %idx.inc, %loop ]
   %idx.sext = sext i8 %idx to i16
-; CHECK: %idx.sext = sext i8 %idx to i16
-; CHECK-NEXT: --> {(1 + (sext i8 %start to i16)),+,1}<%loop>
   %idx.inc = add nsw i8 %idx, 1
   %condition = load volatile i1, ptr %c
   br i1 %condition, label %exit, label %loop
@@ -89,7 +158,23 @@ define void @infer.sext.2(ptr %c, i8 %start) {
 }

 define void @infer.zext.1(ptr %c, i8 %start) {
-; CHECK-LABEL: Classifying expressions for: @infer.zext.1
+; CHECK-LABEL: 'infer.zext.1'
+; CHECK-NEXT:  Classifying expressions for: @infer.zext.1
+; CHECK-NEXT:    %start.inc = add i8 %start, 1
+; CHECK-NEXT:    --> (1 + %start) U: full-set S: full-set
+; CHECK-NEXT:    %idx = phi i8 [ %start.inc, %entry ], [ %idx.inc, %loop ]
+; CHECK-NEXT:    --> {(1 + %start),+,1}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx.zext = zext i8 %idx to i16
+; CHECK-NEXT:    --> {(1 + (zext i8 %start to i16)),+,1}<%loop> U: [1,0) S: [1,0) Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %idx.inc = add nuw i8 %idx, 1
+; CHECK-NEXT:    --> {(2 + %start),+,1}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT:    %condition = load volatile i1, ptr %c, align 1
+; CHECK-NEXT:    --> %condition U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Variant }
+; CHECK-NEXT:  Determining loop execution counts for: @infer.zext.1
+; CHECK-NEXT:  Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable symbolic max backedge-taken count.
+; entry: %start.inc = add i8 %start, 1 %entry.condition = icmp ult i8 %start, 255 @@ -98,8 +183,6 @@ define void @infer.zext.1(ptr %c, i8 %start) { loop: %idx = phi i8 [ %start.inc, %entry ], [ %idx.inc, %loop ] %idx.zext = zext i8 %idx to i16 -; CHECK: %idx.zext = zext i8 %idx to i16 -; CHECK-NEXT: --> {(1 + (zext i8 %start to i16)),+,1}<%loop> %idx.inc = add nuw i8 %idx, 1 %condition = load volatile i1, ptr %c br i1 %condition, label %exit, label %loop diff --git a/llvm/test/CodeGen/AArch64/adds_cmn.ll b/llvm/test/CodeGen/AArch64/adds_cmn.ll new file mode 100644 index 0000000000000..674a3893653a1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/adds_cmn.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -O3 -o - %s | FileCheck %s + +define { i32, i32 } @adds_cmn(i32 noundef %x, i32 noundef %y) { +; CHECK-LABEL: adds_cmn: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmn w0, w1 +; CHECK-NEXT: add w1, w0, w1 +; CHECK-NEXT: cset w8, lo +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret +entry: + %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y) + %_4.1 = extractvalue { i32, i1 } %0, 1 + %_5 = add nuw i32 %x, %y + %_0.sroa.3.0 = select i1 %_4.1, i32 undef, i32 %_5 + %not._4.1 = xor i1 %_4.1, true + %_0.sroa.0.0 = zext i1 %not._4.1 to i32 + %1 = insertvalue { i32, i32 } poison, i32 %_0.sroa.0.0, 0 + %2 = insertvalue { i32, i32 } %1, i32 %_0.sroa.3.0, 1 + ret { i32, i32 } %2 +} + +define { i32, i32 } @adds_cmn_c(i32 noundef %x, i32 noundef %y) { +; CHECK-LABEL: adds_cmn_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmn w0, w1 +; CHECK-NEXT: add w1, w1, w0 +; CHECK-NEXT: cset w8, lo +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret +entry: + %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y) + %_4.1 = extractvalue { i32, i1 } %0, 1 + %_5 = add nuw i32 %y, %x + %_0.sroa.3.0 = select i1 %_4.1, i32 undef, i32 %_5 + %not._4.1 = xor i1 %_4.1, true + %_0.sroa.0.0 = zext i1 %not._4.1 to i32 + %1 = insertvalue { i32, i32 } poison, i32 %_0.sroa.0.0, 0 + %2 = insertvalue { i32, i32 } %1, i32 %_0.sroa.3.0, 1 + ret { i32, i32 } %2 +} + +define { i32, i32 } @subs_cmp(i32 noundef %x, i32 noundef %y) { +; CHECK-LABEL: subs_cmp: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs w1, w0, w1 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret +entry: + %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %x, i32 %y) + %_4.1 = extractvalue { i32, i1 } %0, 1 + %_5 = sub nuw i32 %x, %y + %_0.sroa.3.0 = select i1 %_4.1, i32 undef, i32 %_5 + %not._4.1 = xor i1 %_4.1, true + %_0.sroa.0.0 = zext i1 %not._4.1 to i32 + %1 = insertvalue { i32, i32 } poison, i32 %_0.sroa.0.0, 0 + %2 = insertvalue { i32, i32 } %1, i32 %_0.sroa.3.0, 1 + ret { i32, i32 } %2 +} + +define { i32, i32 } @subs_cmp_c(i32 noundef %x, i32 noundef %y) { +; CHECK-LABEL: subs_cmp_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: sub w1, w1, w0 +; CHECK-NEXT: cset w8, hs +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret +entry: + %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %x, i32 %y) + %_4.1 = extractvalue { i32, i1 } %0, 1 + %_5 = sub nuw i32 %y, %x + %_0.sroa.3.0 = select i1 %_4.1, i32 undef, i32 %_5 + %not._4.1 = xor i1 %_4.1, true + %_0.sroa.0.0 = zext i1 %not._4.1 to i32 + %1 = insertvalue { i32, i32 } poison, i32 %_0.sroa.0.0, 0 + %2 = insertvalue { i32, i32 } %1, i32 %_0.sroa.3.0, 1 + ret { i32, i32 } %2 +} diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll 
b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 066c04b1af088..ef3657433e8b7 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -1243,3 +1243,81 @@ bb: store volatile i8 4, ptr addrspace(5) %p4 ret void } + +define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) { +; GFX940-SDAG-LABEL: soff1_voff1_negative: +; GFX940-SDAG: ; %bb.0: ; %bb +; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX940-SDAG-NEXT: s_endpgm +; +; GFX940-GISEL-LABEL: soff1_voff1_negative: +; GFX940-GISEL: ; %bb.0: ; %bb +; GFX940-GISEL-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX940-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, 1 +; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX940-GISEL-NEXT: v_add3_u32 v0, s0, v0, -1 +; GFX940-GISEL-NEXT: scratch_store_byte v0, v1, off sc0 sc1 +; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX940-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: soff1_voff1_negative: +; GFX11-SDAG: ; %bb.0: ; %bb +; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add3_u32 v0, 0, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: soff1_voff1_negative: +; GFX11-GISEL: ; %bb.0: ; %bb +; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_add_u32 s0, 0, s0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: soff1_voff1_negative: +; GFX12-SDAG: ; %bb.0: ; %bb +; GFX12-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:-1 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: soff1_voff1_negative: +; GFX12-GISEL: ; %bb.0: ; %bb +; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, 0, s0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_endpgm +bb: + %a = alloca [64 x i8], align 4, addrspace(5) + %as = getelementptr i8, ptr addrspace(5) %a, i32 %soff + %voff = call i32 @llvm.amdgcn.workitem.id.x() + %asv = getelementptr i8, ptr addrspace(5) %as, i32 %voff + %p1 = getelementptr i8, ptr addrspace(5) %asv, i32 -1 + store volatile i8 
1, ptr addrspace(5) %p1 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll index e1273e1a4bcd0..eb8c3cadc4997 100644 --- a/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll +++ b/llvm/test/CodeGen/AMDGPU/sink-image-sample.ll @@ -1,34 +1,143 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s -; Test that image.sample instruction is sunk across the branch and not left in the first block. Since the kill may terminate the shader there might be no need for sampling the image. +; Test that image.sample LOD(_L), Level 0(_LZ), Derivative(_D) instructions are sunk across the branch and not left in the first block. Since the kill may terminate the shader there might be no need for sampling the image. ; GCN-LABEL: {{^}}sinking_img_sample: -; GCN-NOT: image_sample +; GCN-NOT: image_sample_l v +; GCN-NOT: image_sample_lz v +; GCN-NOT: image_sample_c_lz v +; GCN-NOT: image_sample_c_l v +; GCN-NOT: image_sample_d v +; GCN-NOT: image_sample_c_d v +; GCN-NOT: image_sample_d_cl v +; GCN-NOT: image_sample_c_d_cl v ; GCN: branch -; GCN: image_sample +; GCN: image_sample_l v +; GCN: image_sample_lz v +; GCN: image_sample_c_lz v +; GCN: image_sample_c_l v +; GCN: image_sample_d v +; GCN: image_sample_c_d v +; GCN: image_sample_d_cl v +; GCN: image_sample_c_d_cl v ; GCN: exp null -define amdgpu_ps float @sinking_img_sample() { +define amdgpu_ps float @sinking_img_sample(i1 %cond) { main_body: - %i = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) - br i1 undef, label %endif1, label %if1 + %i1 = call <3 x float> @llvm.amdgcn.image.sample.l.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i2 = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32(i32 7, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i3 = call <3 x float> @llvm.amdgcn.image.sample.c.lz.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i4 = call <3 x float> @llvm.amdgcn.image.sample.c.l.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i5 = call <3 x float> @llvm.amdgcn.image.sample.d.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i6 = call <3 x float> @llvm.amdgcn.image.sample.c.d.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i7 = call <3 x float> @llvm.amdgcn.image.sample.d.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i8 = call <3 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + br i1 %cond, label %endif1, label %if1 if1: ; preds = %main_body call void 
@llvm.amdgcn.kill(i1 false) #4 br label %exit endif1: ; preds = %main_body - %i22 = extractelement <3 x float> %i, i32 2 + %i22 = extractelement <3 x float> %i1, i32 1 %i23 = call nsz arcp contract float @llvm.fma.f32(float %i22, float 0.000000e+00, float 0.000000e+00) #1 + %i24 = extractelement <3 x float> %i2, i32 1 + %i25 = call nsz arcp contract float @llvm.fma.f32(float %i23, float %i24, float 0.000000e+00) #1 + %i26 = extractelement <3 x float> %i3, i32 1 + %i27 = call nsz arcp contract float @llvm.fma.f32(float %i25, float %i26, float 0.000000e+00) #1 + %i28 = extractelement <3 x float> %i4, i32 1 + %i29 = call nsz arcp contract float @llvm.fma.f32(float %i27, float %i28, float 0.000000e+00) #1 + %i30 = extractelement <3 x float> %i5, i32 1 + %i31 = call nsz arcp contract float @llvm.fma.f32(float %i29, float %i30, float 0.000000e+00) #1 + %i32 = extractelement <3 x float> %i6, i32 1 + %i33 = call nsz arcp contract float @llvm.fma.f32(float %i31, float %i32, float 0.000000e+00) #1 + %i34 = extractelement <3 x float> %i7, i32 1 + %i35 = call nsz arcp contract float @llvm.fma.f32(float %i33, float %i34, float 0.000000e+00) #1 + %i36 = extractelement <3 x float> %i8, i32 1 + %i37 = call nsz arcp contract float @llvm.fma.f32(float %i35, float %i36, float 0.000000e+00) #1 br label %exit exit: ; preds = %endif1, %if1 - %i24 = phi float [ undef, %if1 ], [ %i23, %endif1 ] - ret float %i24 + %i38 = phi float [ poison, %if1 ], [ %i37, %endif1 ] + ret float %i38 } + + +; Test that image.sample instructions which use WQM are marked as Convergent and will be left in the first block. + +; GCN-LABEL: {{^}}no_sinking_img_sample: +; GCN: image_sample v +; GCN: image_sample_c v +; GCN: image_sample_cl v +; GCN: image_sample_c_cl v +; GCN: image_sample_b v +; GCN: image_sample_c_b v +; GCN: image_sample_b_cl v +; GCN: branch +; GCN-NOT: image_sample v +; GCN-NOT: image_sample_c v +; GCN-NOT: image_sample_cl v +; GCN-NOT: image_sample_c_cl v +; GCN-NOT: image_sample_b v +; GCN-NOT: image_sample_c_b v +; GCN-NOT: image_sample_b_cl v +; GCN: exp null + +define amdgpu_ps float @no_sinking_img_sample(i1 %cond) { +main_body: + %i1 = call <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 7, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i2 = call <3 x float> @llvm.amdgcn.image.sample.c.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i3 = call <3 x float> @llvm.amdgcn.image.sample.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i4 = call <3 x float> @llvm.amdgcn.image.sample.c.cl.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i5 = call <3 x float> @llvm.amdgcn.image.sample.b.2d.v3f32.f32(i32 7, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i6 = call <3 x float> @llvm.amdgcn.image.sample.c.b.2d.v3f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + %i7 = call <3 x float> @llvm.amdgcn.image.sample.b.cl.2d.v3f32.f32.f32(i32 7, float poison, float poison, float poison, float poison, <8 x i32> poison, <4 x i32> poison, i1 false, i32 0, i32 0) + br i1 %cond, label %endif1, label %if1 + +if1: ; preds = %main_body + call void @llvm.amdgcn.kill(i1 false) #4 + br label %exit + +endif1: ; 
preds = %main_body + %i22 = extractelement <3 x float> %i1, i32 2 + %i23 = call nsz arcp contract float @llvm.fma.f32(float %i22, float 0.000000e+00, float 0.000000e+00) #1 + %i24 = extractelement <3 x float> %i2, i32 2 + %i25 = call nsz arcp contract float @llvm.fma.f32(float %i23, float %i24, float 0.000000e+00) #1 + %i26 = extractelement <3 x float> %i3, i32 2 + %i27 = call nsz arcp contract float @llvm.fma.f32(float %i25, float %i26, float 0.000000e+00) #1 + %i28 = extractelement <3 x float> %i4, i32 2 + %i29 = call nsz arcp contract float @llvm.fma.f32(float %i27, float %i28, float 0.000000e+00) #1 + %i30 = extractelement <3 x float> %i5, i32 2 + %i31 = call nsz arcp contract float @llvm.fma.f32(float %i29, float %i30, float 0.000000e+00) #1 + %i32 = extractelement <3 x float> %i6, i32 2 + %i33 = call nsz arcp contract float @llvm.fma.f32(float %i31, float %i32, float 0.000000e+00) #1 + %i34 = extractelement <3 x float> %i7, i32 2 + %i35 = call nsz arcp contract float @llvm.fma.f32(float %i33, float %i34, float 0.000000e+00) #1 + br label %exit + +exit: ; preds = %endif1, %if1 + %i36 = phi float [ poison, %if1 ], [ %i35, %endif1 ] + ret float %i36 +} + ; Function Attrs: nounwind readonly willreturn declare <3 x float> @llvm.amdgcn.image.sample.2d.v3f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.c.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.cl.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.c.cl.2d.v3f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.b.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.c.b.2d.v3f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.b.cl.2d.v3f32.f32.(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v3f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.l.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.c.lz.2d.v3f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.c.l.2d.v3f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.d.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.c.d.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> 
@llvm.amdgcn.image.sample.d.cl.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 +declare <3 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v3f32.f32.f32(i32 immarg, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #3 ; Function Attrs: nofree nosync nounwind readnone speculatable willreturn declare float @llvm.fma.f32(float, float, float) #2 diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index b919bf0605a12..2d84e87722951 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -2,10 +2,12 @@ ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12 %s ; @basic_smax_smin(i16 %src0, i16 %src1) { ; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; +; SDAG-GFX12-LABEL: basic_smax_smin: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: basic_smax_smin: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -76,11 +91,26 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { ; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) %src1.clamp = call i16 @llvm.smin.i16(i16 %src1.max, i16 255) - %insert.0 = insertelement <2 x i16> undef, i16 
%src0.clamp, i32 0 + %insert.0 = insertelement <2 x i16> poison, i16 %src0.clamp, i32 0 %vec = insertelement <2 x i16> %insert.0, i16 %src1.clamp, i32 1 ret <2 x i16> %vec } @@ -128,6 +158,19 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; SDAG-GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; SDAG-GFX11-NEXT: s_endpgm ; +; SDAG-GFX12-LABEL: basic_smax_smin_sgpr: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; SDAG-GFX12-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v0, s2, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v1, s3, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; SDAG-GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; SDAG-GFX12-NEXT: s_endpgm +; ; GISEL-VI-LABEL: basic_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -192,6 +235,28 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_endpgm +; +; GISEL-GFX12-LABEL: basic_smax_smin_sgpr: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s4, 0 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s5, 0xff +; GISEL-GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s2, s2 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GISEL-GFX12-NEXT: s_max_i32 s2, s2, s4 +; GISEL-GFX12-NEXT: s_max_i32 s3, s3, s4 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s2, s2 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GISEL-GFX12-NEXT: s_min_i32 s2, s2, s5 +; GISEL-GFX12-NEXT: s_min_i32 s3, s3, s5 +; GISEL-GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GISEL-GFX12-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX12-NEXT: s_endpgm + %src0 = trunc i32 %src0ext to i16 %src1 = trunc i32 %src1ext to i16 %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) @@ -235,6 +300,19 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) { ; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; +; SDAG-GFX12-LABEL: basic_smin_smax: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: basic_smin_smax: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -265,6 +343,21 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) { ; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smin_smax: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: 
s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255) %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0) %src1.min = call i16 @llvm.smin.i16(i16 %src1, i16 255) @@ -305,6 +398,19 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) { ; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; +; SDAG-GFX12-LABEL: basic_smin_smax_combined: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: basic_smin_smax_combined: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -335,6 +441,21 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) { ; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smin_smax_combined: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255) %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0) %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) @@ -373,6 +494,18 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { ; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; +; SDAG-GFX12-LABEL: vec_smax_smin: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: vec_smax_smin: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -400,6 +533,19 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) { ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: vec_smax_smin: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; 
GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %src.max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src, <2 x i16> ) %src.clamp = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src.max, <2 x i16> ) ret <2 x i16> %src.clamp @@ -449,6 +595,17 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; SDAG-GFX11-NEXT: s_endpgm ; +; SDAG-GFX12-LABEL: vec_smax_smin_sgpr: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; SDAG-GFX12-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_pk_max_i16 v0, s2, 0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-GFX12-NEXT: s_endpgm +; ; GISEL-VI-LABEL: vec_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_load_dword s2, s[4:5], 0x2c @@ -521,6 +678,30 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GISEL-GFX11-NEXT: s_endpgm +; +; GISEL-GFX12-LABEL: vec_smax_smin_sgpr: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, 0 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s4, s2 +; GISEL-GFX12-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX12-NEXT: s_max_i32 s3, s4, s3 +; GISEL-GFX12-NEXT: s_max_i32 s2, s2, 0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GISEL-GFX12-NEXT: s_sext_i32_i16 s3, 0xff00ff +; GISEL-GFX12-NEXT: s_sext_i32_i16 s4, s2 +; GISEL-GFX12-NEXT: s_ashr_i32 s2, s2, 16 +; GISEL-GFX12-NEXT: s_min_i32 s3, s4, s3 +; GISEL-GFX12-NEXT: s_min_i32 s2, s2, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GISEL-GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GISEL-GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX12-NEXT: s_endpgm + %src.max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src, <2 x i16> ) %src.clamp = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src.max, <2 x i16> ) store <2 x i16> %src.clamp, ptr addrspace(1) %out @@ -556,6 +737,18 @@ define <2 x i16> @vec_smin_smax(<2 x i16> %src) { ; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; +; SDAG-GFX12-LABEL: vec_smin_smax: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: vec_smin_smax: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -583,9 +776,617 @@ define <2 x i16> @vec_smin_smax(<2 x i16> %src) { ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: vec_smin_smax: +; GISEL-GFX12: ; 
%bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %src.min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src, <2 x i16> ) %src.clamp = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src.min, <2 x i16> ) ret <2 x i16> %src.clamp } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} +define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { +; SDAG-VI-LABEL: basic_smax_smin_bit_or: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_bit_or: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; SDAG-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: basic_smax_smin_bit_or: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_bit_or: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_bit_or: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; GISEL-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_bit_or: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_bit_or: +; 
GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) + %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) + %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) + %src1.clamp = call i16 @llvm.smin.i16(i16 %src1.max, i16 255) + %src0.and = and i16 %src0.clamp, 255 + %src1.shl = shl i16 %src1.clamp, 8 + %or = or i16 %src0.and, %src1.shl + ret i16 %or +} +define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { +; SDAG-VI-LABEL: basic_umax_umin_bit_or: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_u16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_umax_umin_bit_or: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff +; SDAG-GFX9-NEXT: v_min_u16_e32 v0, 0xff, v0 +; SDAG-GFX9-NEXT: v_min_u16_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: basic_umax_umin_bit_or: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_u16 v1, 0xff, v1 +; GFX11-NEXT: v_min_u16 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_umax_umin_bit_or: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_min_u16 v1, 0xff, v1 +; SDAG-GFX12-NEXT: v_min_u16 v0, 0xff, v0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_umax_umin_bit_or: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-VI-NEXT: v_min_u16_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_min_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_umax_umin_bit_or: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-GFX9-NEXT: v_min_u16_e32 v0, 0xff, v0 +; GISEL-GFX9-NEXT: v_min_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_umax_umin_bit_or: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_min_u16 v1, 0xff, v1 +; GISEL-GFX12-NEXT: v_min_u16 v0, 0xff, v0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %src0.max = call i16 @llvm.umax.i16(i16 %src0, i16 0) + %src0.clamp = call i16 @llvm.umin.i16(i16 %src0.max, i16 255) + %src1.max = call i16 @llvm.umax.i16(i16 %src1, i16 0) + %src1.clamp = call i16 @llvm.umin.i16(i16 %src1.max, i16 255) + %src0.and = and i16 %src0.clamp, 255 + %src1.shl = shl i16 %src1.clamp, 8 + %or = or i16 %src0.and, %src1.shl + ret i16 %or +} +define i16 @basic_smax_smin_vec_cast(i16 %src0, i16 %src1) { +; SDAG-VI-LABEL: basic_smax_smin_vec_cast: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_vec_cast: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; SDAG-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; SDAG-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-LABEL: basic_smax_smin_vec_cast: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_vec_cast: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_vec_cast: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; GISEL-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 
dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_vec_cast: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: basic_smax_smin_vec_cast: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_vec_cast: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) + %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) + %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) + %src1.clamp = call i16 @llvm.smin.i16(i16 %src1.max, i16 255) + %insert.0 = insertelement <2 x i16> undef, i16 %src0.clamp, i32 0 + %vec = insertelement <2 x i16> %insert.0, i16 %src1.clamp, i32 1 + %vec.trunc = trunc <2 x i16> %vec to <2 x i8> + %cast = bitcast <2 x i8> %vec.trunc to i16 + ret i16 %cast +} +define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { +; SDAG-VI-LABEL: basic_smax_smin_bit_shl: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_bit_shl: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX9-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: basic_smax_smin_bit_shl: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_i16 v1, v1, 0 +; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_bit_shl: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_max_i16 v1, v1, 0 +; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_bit_shl: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_bit_shl: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-GFX9-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_bit_shl: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_max_i16 v1, v1, 0 +; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) + %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) + %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) + %src1.shl = shl i16 %src1.max, 8 + %or = or i16 %src0.clamp, %src1.shl + ret i16 %or +} +define i16 @basic_smax_smin_vec_input(<2 x i16> %src) { +; SDAG-VI-LABEL: basic_smax_smin_vec_input: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0xff +; SDAG-VI-NEXT: v_min_i16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_vec_input: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff +; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0] +; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-LABEL: basic_smax_smin_vec_input: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_vec_input: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_vec_input: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0xff +; GISEL-VI-NEXT: v_min_i16_e32 v2, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v2 +; GISEL-VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_vec_input: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff00ff +; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v1, v0 +; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: basic_smax_smin_vec_input: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_vec_input: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0 
+; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %smin = call <2 x i16> @llvm.smin.v2i16(<2 x i16> , <2 x i16> %src) + %smed = call <2 x i16> @llvm.smax.v2i16(<2 x i16> , <2 x i16> %smin) + %vec.trunc = trunc <2 x i16> %smed to <2 x i8> + %cast = bitcast <2 x i8> %vec.trunc to i16 + ret i16 %cast +} +define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { +; SDAG-VI-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-VI-NEXT: v_max_i16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff +; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0] +; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v0 +; GISEL-VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 +; 
GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-VI-NEXT: v_or_b32_e32 v0, v1, v0 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff00ff +; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v1, v0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + + %smax = call <2 x i16> @llvm.smax.v2i16(<2 x i16> , <2 x i16> %src) + %smed = call <2 x i16> @llvm.smin.v2i16(<2 x i16> , <2 x i16> %smax) + %vec.trunc = trunc <2 x i16> %smed to <2 x i8> + %cast = bitcast <2 x i8> %vec.trunc to i16 + ret i16 %cast +} + diff --git a/llvm/test/CodeGen/ARM/loopvectorize_pr33804.ll b/llvm/test/CodeGen/ARM/loopvectorize_pr33804.ll index 8038dad3fe92f..cee7fde89b070 100644 --- a/llvm/test/CodeGen/ARM/loopvectorize_pr33804.ll +++ b/llvm/test/CodeGen/ARM/loopvectorize_pr33804.ll @@ -26,7 +26,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en %next19.i.i = getelementptr inbounds %struct.CvNode1D, ptr %dst, i32 %i.1424.i.i, i32 1 store ptr %dst, ptr %next19.i.i, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1024 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i @@ -52,7 +52,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en %val.i.i = getelementptr inbounds %struct.CvNode1D2, ptr %arrayidx15.i.i1427, i32 0, i32 1 store float 
0xC415AF1D80000000, ptr %val.i.i, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1024 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i @@ -79,7 +79,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en store float %loadf, ptr %dst.ptr, align 4 store ptr %loadp, ptr %dst.ptr.1, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1024 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i @@ -107,7 +107,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en store float %loadf, ptr %dst.ptr, align 4 store ptr %loadp, ptr %dst.ptr.1, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1024 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i diff --git a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll index dc710a5c288a7..1fbfd0a987d7a 100644 --- a/llvm/test/CodeGen/NVPTX/arithmetic-int.ll +++ b/llvm/test/CodeGen/NVPTX/arithmetic-int.ll @@ -317,3 +317,26 @@ define i16 @lshr_i16(i16 %a, i16 %b) { %ret = lshr i16 %a, %b ret i16 %ret } + +;; Immediate cases + +define i16 @srem_i16_ir(i16 %a) { +; CHECK: rem.s16 %rs{{[0-9]+}}, 12, %rs{{[0-9]+}} +; CHECK: ret + %ret = srem i16 12, %a + ret i16 %ret +} + +define i32 @udiv_i32_ir(i32 %a) { +; CHECK: div.u32 %r{{[0-9]+}}, 34, %r{{[0-9]+}} +; CHECK: ret + %ret = udiv i32 34, %a + ret i32 %ret +} + +define i64 @sub_i64_ir(i64 %a) { +; CHECK: sub.s64 %rd{{[0-9]+}}, 56, %rd{{[0-9]+}} +; CHECK: ret + %ret = sub i64 56, %a + ret i64 %ret +} diff --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll index 69ee6167a4d3e..3416420367beb 100644 --- a/llvm/test/CodeGen/NVPTX/fma.ll +++ b/llvm/test/CodeGen/NVPTX/fma.ll @@ -41,3 +41,17 @@ define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) { %d = call double @dummy_f64(double %b, double %c) ret double %d } + +define ptx_device float @f32_iir(float %x) { +; CHECK: fma.rn.f32 %f{{[0-9]+}}, 0f52E8D4A5, 0f4A52FC54, %f{{[0-9]+}}; +; CHECK: ret; + %r = call float @llvm.fma.f32(float 499999997952.0, float 3456789.0, float %x) + ret float %r +} + +define ptx_device float @f32_iii(float %x) { +; CHECK: mov.f32 %f{{[0-9]+}}, 0f41200000; +; CHECK: ret; + %r = call float @llvm.fma.f32(float 2.0, float 3.0, float 4.0) + ret float %r +} diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index accfbe4af0313..7ece0ccbd844e 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -6,7 +6,7 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: srem_i128( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<19>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-NEXT: .reg .b64 %rd<129>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases @@ -67,32 +67,29 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: or.b64 %rd72, %rd121, %rd122; ; CHECK-NEXT: setp.eq.s64 %p15, %rd72, 0; ; CHECK-NEXT: cvt.u32.u64 %r9, %rd66; -; CHECK-NEXT: mov.b32 %r10, 127; -; CHECK-NEXT: sub.s32 %r11, %r10, %r9; -; CHECK-NEXT: shl.b64 %rd73, %rd4, %r11; -; CHECK-NEXT: mov.b32 %r12, 64; -; CHECK-NEXT: sub.s32 %r13, %r12, %r11; -; CHECK-NEXT: 
shr.u64 %rd74, %rd3, %r13; +; CHECK-NEXT: sub.s32 %r10, 127, %r9; +; CHECK-NEXT: shl.b64 %rd73, %rd4, %r10; +; CHECK-NEXT: sub.s32 %r11, 64, %r10; +; CHECK-NEXT: shr.u64 %rd74, %rd3, %r11; ; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74; -; CHECK-NEXT: mov.b32 %r14, 63; -; CHECK-NEXT: sub.s32 %r15, %r14, %r9; -; CHECK-NEXT: shl.b64 %rd76, %rd3, %r15; -; CHECK-NEXT: setp.gt.s32 %p16, %r11, 63; +; CHECK-NEXT: sub.s32 %r12, 63, %r9; +; CHECK-NEXT: shl.b64 %rd76, %rd3, %r12; +; CHECK-NEXT: setp.gt.s32 %p16, %r10, 63; ; CHECK-NEXT: selp.b64 %rd126, %rd76, %rd75, %p16; -; CHECK-NEXT: shl.b64 %rd125, %rd3, %r11; +; CHECK-NEXT: shl.b64 %rd125, %rd3, %r10; ; CHECK-NEXT: mov.u64 %rd116, %rd119; ; CHECK-NEXT: @%p15 bra $L__BB0_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r16, %rd121; -; CHECK-NEXT: shr.u64 %rd79, %rd3, %r16; -; CHECK-NEXT: sub.s32 %r18, %r12, %r16; -; CHECK-NEXT: shl.b64 %rd80, %rd4, %r18; +; CHECK-NEXT: cvt.u32.u64 %r13, %rd121; +; CHECK-NEXT: shr.u64 %rd79, %rd3, %r13; +; CHECK-NEXT: sub.s32 %r14, 64, %r13; +; CHECK-NEXT: shl.b64 %rd80, %rd4, %r14; ; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80; -; CHECK-NEXT: add.s32 %r19, %r16, -64; -; CHECK-NEXT: shr.u64 %rd82, %rd4, %r19; -; CHECK-NEXT: setp.gt.s32 %p17, %r16, 63; +; CHECK-NEXT: add.s32 %r15, %r13, -64; +; CHECK-NEXT: shr.u64 %rd82, %rd4, %r15; +; CHECK-NEXT: setp.gt.s32 %p17, %r13, 63; ; CHECK-NEXT: selp.b64 %rd123, %rd82, %rd81, %p17; -; CHECK-NEXT: shr.u64 %rd124, %rd4, %r16; +; CHECK-NEXT: shr.u64 %rd124, %rd4, %r13; ; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1; ; CHECK-NEXT: mov.b64 %rd116, 0; @@ -155,7 +152,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: urem_i128( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<17>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-NEXT: .reg .b64 %rd<115>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases @@ -205,32 +202,29 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: or.b64 %rd62, %rd107, %rd108; ; CHECK-NEXT: setp.eq.s64 %p13, %rd62, 0; ; CHECK-NEXT: cvt.u32.u64 %r9, %rd56; -; CHECK-NEXT: mov.b32 %r10, 127; -; CHECK-NEXT: sub.s32 %r11, %r10, %r9; -; CHECK-NEXT: shl.b64 %rd63, %rd42, %r11; -; CHECK-NEXT: mov.b32 %r12, 64; -; CHECK-NEXT: sub.s32 %r13, %r12, %r11; -; CHECK-NEXT: shr.u64 %rd64, %rd41, %r13; +; CHECK-NEXT: sub.s32 %r10, 127, %r9; +; CHECK-NEXT: shl.b64 %rd63, %rd42, %r10; +; CHECK-NEXT: sub.s32 %r11, 64, %r10; +; CHECK-NEXT: shr.u64 %rd64, %rd41, %r11; ; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; -; CHECK-NEXT: mov.b32 %r14, 63; -; CHECK-NEXT: sub.s32 %r15, %r14, %r9; -; CHECK-NEXT: shl.b64 %rd66, %rd41, %r15; -; CHECK-NEXT: setp.gt.s32 %p14, %r11, 63; +; CHECK-NEXT: sub.s32 %r12, 63, %r9; +; CHECK-NEXT: shl.b64 %rd66, %rd41, %r12; +; CHECK-NEXT: setp.gt.s32 %p14, %r10, 63; ; CHECK-NEXT: selp.b64 %rd112, %rd66, %rd65, %p14; -; CHECK-NEXT: shl.b64 %rd111, %rd41, %r11; +; CHECK-NEXT: shl.b64 %rd111, %rd41, %r10; ; CHECK-NEXT: mov.u64 %rd102, %rd105; ; CHECK-NEXT: @%p13 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r16, %rd107; -; CHECK-NEXT: shr.u64 %rd69, %rd41, %r16; -; CHECK-NEXT: sub.s32 %r18, %r12, %r16; -; CHECK-NEXT: shl.b64 %rd70, %rd42, %r18; +; CHECK-NEXT: cvt.u32.u64 %r13, %rd107; +; CHECK-NEXT: shr.u64 %rd69, %rd41, %r13; +; CHECK-NEXT: sub.s32 %r14, 64, %r13; +; CHECK-NEXT: shl.b64 %rd70, %rd42, %r14; ; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; -; CHECK-NEXT: add.s32 %r19, %r16, -64; -; CHECK-NEXT: shr.u64 
%rd72, %rd42, %r19; -; CHECK-NEXT: setp.gt.s32 %p15, %r16, 63; +; CHECK-NEXT: add.s32 %r15, %r13, -64; +; CHECK-NEXT: shr.u64 %rd72, %rd42, %r15; +; CHECK-NEXT: setp.gt.s32 %p15, %r13, 63; ; CHECK-NEXT: selp.b64 %rd109, %rd72, %rd71, %p15; -; CHECK-NEXT: shr.u64 %rd110, %rd42, %r16; +; CHECK-NEXT: shr.u64 %rd110, %rd42, %r13; ; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1; ; CHECK-NEXT: mov.b64 %rd102, 0; @@ -324,7 +318,7 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: sdiv_i128( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<19>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-NEXT: .reg .b64 %rd<122>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases @@ -386,32 +380,29 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115; ; CHECK-NEXT: setp.eq.s64 %p15, %rd73, 0; ; CHECK-NEXT: cvt.u32.u64 %r9, %rd67; -; CHECK-NEXT: mov.b32 %r10, 127; -; CHECK-NEXT: sub.s32 %r11, %r10, %r9; -; CHECK-NEXT: shl.b64 %rd74, %rd2, %r11; -; CHECK-NEXT: mov.b32 %r12, 64; -; CHECK-NEXT: sub.s32 %r13, %r12, %r11; -; CHECK-NEXT: shr.u64 %rd75, %rd1, %r13; +; CHECK-NEXT: sub.s32 %r10, 127, %r9; +; CHECK-NEXT: shl.b64 %rd74, %rd2, %r10; +; CHECK-NEXT: sub.s32 %r11, 64, %r10; +; CHECK-NEXT: shr.u64 %rd75, %rd1, %r11; ; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; -; CHECK-NEXT: mov.b32 %r14, 63; -; CHECK-NEXT: sub.s32 %r15, %r14, %r9; -; CHECK-NEXT: shl.b64 %rd77, %rd1, %r15; -; CHECK-NEXT: setp.gt.s32 %p16, %r11, 63; +; CHECK-NEXT: sub.s32 %r12, 63, %r9; +; CHECK-NEXT: shl.b64 %rd77, %rd1, %r12; +; CHECK-NEXT: setp.gt.s32 %p16, %r10, 63; ; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p16; -; CHECK-NEXT: shl.b64 %rd118, %rd1, %r11; +; CHECK-NEXT: shl.b64 %rd118, %rd1, %r10; ; CHECK-NEXT: mov.u64 %rd109, %rd112; ; CHECK-NEXT: @%p15 bra $L__BB4_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r16, %rd114; -; CHECK-NEXT: shr.u64 %rd80, %rd1, %r16; -; CHECK-NEXT: sub.s32 %r18, %r12, %r16; -; CHECK-NEXT: shl.b64 %rd81, %rd2, %r18; +; CHECK-NEXT: cvt.u32.u64 %r13, %rd114; +; CHECK-NEXT: shr.u64 %rd80, %rd1, %r13; +; CHECK-NEXT: sub.s32 %r14, 64, %r13; +; CHECK-NEXT: shl.b64 %rd81, %rd2, %r14; ; CHECK-NEXT: or.b64 %rd82, %rd80, %rd81; -; CHECK-NEXT: add.s32 %r19, %r16, -64; -; CHECK-NEXT: shr.u64 %rd83, %rd2, %r19; -; CHECK-NEXT: setp.gt.s32 %p17, %r16, 63; +; CHECK-NEXT: add.s32 %r15, %r13, -64; +; CHECK-NEXT: shr.u64 %rd83, %rd2, %r15; +; CHECK-NEXT: setp.gt.s32 %p17, %r13, 63; ; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p17; -; CHECK-NEXT: shr.u64 %rd117, %rd2, %r16; +; CHECK-NEXT: shr.u64 %rd117, %rd2, %r13; ; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1; ; CHECK-NEXT: mov.b64 %rd109, 0; @@ -466,7 +457,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: udiv_i128( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<17>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-NEXT: .reg .b64 %rd<107>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases @@ -516,32 +507,29 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100; ; CHECK-NEXT: setp.eq.s64 %p13, %rd62, 0; ; CHECK-NEXT: cvt.u32.u64 %r9, %rd56; -; CHECK-NEXT: mov.b32 %r10, 127; -; CHECK-NEXT: sub.s32 %r11, %r10, %r9; -; CHECK-NEXT: shl.b64 %rd63, %rd42, %r11; -; CHECK-NEXT: mov.b32 %r12, 64; -; CHECK-NEXT: sub.s32 %r13, %r12, %r11; -; CHECK-NEXT: shr.u64 %rd64, %rd41, %r13; +; CHECK-NEXT: sub.s32 %r10, 127, %r9; +; 
CHECK-NEXT: shl.b64 %rd63, %rd42, %r10;
+; CHECK-NEXT: sub.s32 %r11, 64, %r10;
+; CHECK-NEXT: shr.u64 %rd64, %rd41, %r11;
 ; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64;
-; CHECK-NEXT: mov.b32 %r14, 63;
-; CHECK-NEXT: sub.s32 %r15, %r14, %r9;
-; CHECK-NEXT: shl.b64 %rd66, %rd41, %r15;
-; CHECK-NEXT: setp.gt.s32 %p14, %r11, 63;
+; CHECK-NEXT: sub.s32 %r12, 63, %r9;
+; CHECK-NEXT: shl.b64 %rd66, %rd41, %r12;
+; CHECK-NEXT: setp.gt.s32 %p14, %r10, 63;
 ; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p14;
-; CHECK-NEXT: shl.b64 %rd103, %rd41, %r11;
+; CHECK-NEXT: shl.b64 %rd103, %rd41, %r10;
 ; CHECK-NEXT: mov.u64 %rd94, %rd97;
 ; CHECK-NEXT: @%p13 bra $L__BB5_4;
 ; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r16, %rd99;
-; CHECK-NEXT: shr.u64 %rd69, %rd41, %r16;
-; CHECK-NEXT: sub.s32 %r18, %r12, %r16;
-; CHECK-NEXT: shl.b64 %rd70, %rd42, %r18;
+; CHECK-NEXT: cvt.u32.u64 %r13, %rd99;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r13;
+; CHECK-NEXT: sub.s32 %r14, 64, %r13;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r14;
 ; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
-; CHECK-NEXT: add.s32 %r19, %r16, -64;
-; CHECK-NEXT: shr.u64 %rd72, %rd42, %r19;
-; CHECK-NEXT: setp.gt.s32 %p15, %r16, 63;
+; CHECK-NEXT: add.s32 %r15, %r13, -64;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r15;
+; CHECK-NEXT: setp.gt.s32 %p15, %r13, 63;
 ; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p15;
-; CHECK-NEXT: shr.u64 %rd102, %rd42, %r16;
+; CHECK-NEXT: shr.u64 %rd102, %rd42, %r13;
 ; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1;
 ; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1;
 ; CHECK-NEXT: mov.b64 %rd94, 0;
diff --git a/llvm/test/CodeGen/NVPTX/shift-parts.ll b/llvm/test/CodeGen/NVPTX/shift-parts.ll
index c7cfdc4ff2a4d..ded1046714fd5 100644
--- a/llvm/test/CodeGen/NVPTX/shift-parts.ll
+++ b/llvm/test/CodeGen/NVPTX/shift-parts.ll
@@ -4,7 +4,6 @@
 ; CHECK: shift_parts_left_128
 define void @shift_parts_left_128(ptr %val, ptr %amtptr) {
 ; CHECK: shl.b64
-; CHECK: mov.b32
 ; CHECK: sub.s32
 ; CHECK: shr.u64
 ; CHECK: or.b64
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
index 7c1fab9bfe91a..f777c450bc106 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
@@ -400,33 +400,21 @@ declare void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64>, <vscale x 1 x ptr>, i32, <vscale x 1 x i1>)
 declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)

-; TODO: Make the step loop variant to reflect what the loop vectorizer will emit
-; in an EVL tail folding configuration.
-
 define <vscale x 1 x i64> @vp_gather(ptr %a, i32 %len) {
 ; CHECK-LABEL: @vp_gather(
 ; CHECK-NEXT: vector.ph:
 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64
 ; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP0]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACCUM:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]]
-; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true)
-; CHECK-NEXT: [[ODD:%.*]] = and <vscale x 1 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT: [[MASK:%.*]] = icmp ne <vscale x 1 x i64> [[ODD]], zeroinitializer
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3
-; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP2]], i64 16, <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP2]], i64 16, <vscale x 1 x i1> splat (i1 true), i32 42)
 ; CHECK-NEXT: [[ACCUM_NEXT]] = add <vscale x 1 x i64> [[ACCUM]], [[GATHER]]
 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[TMP0]]
 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[TMP0]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
 ; CHECK: for.cond.cleanup:
@@ -444,15 +432,8 @@ vector.body: ; preds = %vector.body, %vecto
 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 %vec.ind = phi <vscale x 1 x i64> [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ]
 %accum = phi <vscale x 1 x i64> [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ]
-
- %elems = sub i64 %wide.trip.count, %index
- %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 1, i1 true)
-
- %odd = and <vscale x 1 x i64> %vec.ind, splat (i64 1)
- %mask = icmp ne <vscale x 1 x i64> %odd, splat (i64 0)
-
 %2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %vec.ind, i32 3
- %gather = call <vscale x 1 x i64> @llvm.vp.gather(<vscale x 1 x ptr> %2, <vscale x 1 x i1> %mask, i32 %evl)
+ %gather = call <vscale x 1 x i64> @llvm.vp.gather(<vscale x 1 x ptr> %2, <vscale x 1 x i1> splat (i1 true), i32 42)
 %accum.next = add <vscale x 1 x i64> %accum, %gather
 %index.next = add nuw i64 %index, %0
 %vec.ind.next = add <vscale x 1 x i64> %vec.ind, %.splat
@@ -463,31 +444,19 @@ for.cond.cleanup: ; preds = %vector.body
 ret <vscale x 1 x i64> %accum.next
 }

-; TODO: Make the step loop variant to reflect what the loop vectorizer will emit
-; in an EVL tail folding configuration.
-
 define void @vp_scatter(ptr %a, i32 %len) {
 ; CHECK-LABEL: @vp_scatter(
 ; CHECK-NEXT: vector.ph:
 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64
 ; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP0]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]]
-; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true)
-; CHECK-NEXT: [[ODD:%.*]] = and <vscale x 1 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT: [[MASK:%.*]] = icmp ne <vscale x 1 x i64> [[ODD]], zeroinitializer
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3
-; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64(<vscale x 1 x i64> zeroinitializer, ptr [[TMP2]], i64 16, <vscale x 1 x i1> [[MASK]], i32 [[EVL]])
+; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i64(<vscale x 1 x i64> zeroinitializer, ptr [[TMP2]], i64 16, <vscale x 1 x i1> splat (i1 true), i32 42)
 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[TMP0]]
 ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[TMP0]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
 ; CHECK: for.cond.cleanup:
@@ -504,17 +473,120 @@ vector.ph:
 vector.body: ; preds = %vector.body, %vector.ph
 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 %vec.ind = phi <vscale x 1 x i64> [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ]
+ %2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %vec.ind, i32 3
+ tail call void @llvm.vp.scatter(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %2, <vscale x 1 x i1> splat (i1 true), i32 42)
+ %index.next = add nuw i64 %index, %0
+ %vec.ind.next = add <vscale x 1 x i64> %vec.ind, %.splat
+ %3 = icmp ne i64 %index.next, %wide.trip.count
+ br i1 %3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret void
+}
+
+; Test that reflects what the loop vectorizer will generate for an EVL tail
+; folded loop
+
+define <vscale x 1 x i64> @evl_gather(ptr %a, i32 %len) {
+; CHECK-LABEL: @evl_gather(
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACCUM:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDEX]]
+; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true)
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], <vscale x 1 x i64> [[VEC_IND]], i32 3
+; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.vp.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[TMP2]], <vscale x 1 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[ACCUM_NEXT]] = add <vscale x 1 x i64> [[ACCUM]], [[GATHER]]
+; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[EVL_ZEXT]]
+; CHECK-NEXT: [[EVL_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[EVL_ZEXT]], i64 0
+; CHECK-NEXT: [[EVL_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[EVL_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[EVL_SPLAT]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret <vscale x 1 x i64> [[ACCUM_NEXT]]
+;
+vector.ph:
+ %wide.trip.count = zext i32 %len to i64
+ %1 = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.ind = phi <vscale x 1 x i64> [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ]
+ %accum = phi <vscale x 1 x i64> [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ]
 %elems = sub i64 %wide.trip.count, %index
 %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 1, i1 true)
- %odd = and <vscale x 1 x i64> %vec.ind, splat (i64 1)
- %mask = icmp ne <vscale x 1 x i64> %odd, splat (i64 0)
+ %2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %vec.ind, i32 3
+ %gather = call <vscale x 1 x i64> @llvm.vp.gather(<vscale x 1 x ptr> %2, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+ %accum.next = add <vscale x 1 x i64> %accum, %gather
+
+ %evl.zext = zext i32 %evl to i64
+ %index.next = add nuw i64 %index, %evl.zext
+ %evl.splatinsert = insertelement <vscale x 1 x i64> poison, i64 %evl.zext, i64 0
+ %evl.splat = shufflevector <vscale x 1 x i64> %evl.splatinsert, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %vec.ind.next = add <vscale x 1 x i64> %vec.ind, %evl.splat
+ %3 = icmp ne i64 %index.next, %wide.trip.count
+ br i1 %3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret <vscale x 1 x i64> %accum.next
+}
+
+; Test that reflects what the loop vectorizer will generate for an EVL tail
+; folded loop
+
+define void @evl_scatter(ptr %a, i32 %len) {
+; CHECK-LABEL: @evl_scatter(
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]]
+; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], <vscale x 1 x i64> [[VEC_IND]], i32 3
+; CHECK-NEXT: tail call void @llvm.vp.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> [[TMP1]], <vscale x 1 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]]
+; CHECK-NEXT: [[EVL_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[EVL_ZEXT]], i64 0
+; CHECK-NEXT: [[EVL_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[EVL_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[EVL_SPLAT]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+vector.ph:
+ %wide.trip.count = zext i32 %len to i64
+ %1 = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.ind = phi <vscale x 1 x i64> [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ]
+
+ %elems = sub i64 %wide.trip.count, %index
+ %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 1, i1 true)
 %2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %vec.ind, i32 3
- tail call void @llvm.vp.scatter(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %2, <vscale x 1 x i1> %mask, i32 %evl)
- %index.next = add nuw i64 %index, %0
- %vec.ind.next = add <vscale x 1 x i64> %vec.ind, %.splat
+ tail call void @llvm.vp.scatter(<vscale x 1 x i64> zeroinitializer, <vscale x 1 x ptr> %2, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+
+ %evl.zext = zext i32 %evl to i64
+ %index.next = add nuw i64 %index, %evl.zext
+ %evl.splatinsert = insertelement <vscale x 1 x i64> poison, i64 %evl.zext, i64 0
+ %evl.splat = shufflevector <vscale x 1 x i64> %evl.splatinsert, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %vec.ind.next = add <vscale x 1 x i64> %vec.ind, %evl.splat
 %3 = icmp ne i64 %index.next, %wide.trip.count
 br i1 %3, label %for.cond.cleanup, label %vector.body
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index 9642e5e4c9f86..26af46263c0e2 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -699,16 +699,13 @@ define <16 x i8> @evenelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwi
 ;
 ; AVX2-LABEL: evenelts_v32i16_shuffle_v16i16_to_v16i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4]
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
@@ -783,16 +780,13 @@ define <16 x i8> @oddelts_v32i16_shuffle_v16i16_to_v16i8(<32 x i16> %n2) nounwin
 ;
 ; AVX2-LABEL: oddelts_v32i16_shuffle_v16i16_to_v16i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 =
[2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,4] +; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index f0f02f1ed890a..ec442c185706c 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -275,53 +275,45 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) { ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vpsrld $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vpsrld $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13] +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: 
vpsrld $8, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21] +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vpsrld $8, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13] +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u] +; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BWVL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 +; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/trunc-vector-width.ll b/llvm/test/CodeGen/X86/trunc-vector-width.ll index bc6969c5cd37a..42cc624b5a535 100644 --- a/llvm/test/CodeGen/X86/trunc-vector-width.ll +++ b/llvm/test/CodeGen/X86/trunc-vector-width.ll @@ -4,14 +4,16 @@ define void @test(ptr %a0) #0 { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqu (%rdi), %xmm0 -; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,5,5,0,0,1,1,u,u,u,u,u,u,u,u] -; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqu (%rdi), %ymm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,0,0] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = mem[0],ymm0[1,2,3,4,5,6,7] +; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; CHECK-NEXT: vpextrb $1, %xmm0, (%rax) ; CHECK-NEXT: vpextrb $4, %xmm0, (%rax) ; CHECK-NEXT: vpextrb $8, %xmm0, (%rax) +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %load = load <64 x i8>, ptr %a0, align 1 %shuf = shufflevector <64 x i8> %load, <64 x i8> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index 01181d4b21d9d..abef980277ece 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -849,146 +849,122 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i8_stride4_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm6 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-NEXT: vpermd %ymm5, %ymm4, %ymm5 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vmovdqa %xmm4, (%rsi) -; AVX2-NEXT: vmovdqa %xmm5, (%rdx) -; AVX2-NEXT: vmovdqa %xmm6, (%rcx) +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = 
[3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-NEXT: vmovdqa %xmm1, (%rsi) +; AVX2-NEXT: vmovdqa %xmm3, (%rdx) +; AVX2-NEXT: vmovdqa %xmm5, (%rcx) ; AVX2-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride4_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-FP-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm6 +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm4, %ymm5 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx) -; AVX2-FP-NEXT: vmovdqa %xmm6, (%rcx) +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsi) +; AVX2-FP-NEXT: vmovdqa %xmm3, (%rdx) +; AVX2-FP-NEXT: vmovdqa %xmm5, (%rcx) ; AVX2-FP-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride4_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, 
%ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm6 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rcx) +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsi) +; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride4_vf16: @@ -1446,228 +1422,198 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i8_stride4_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm9 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] -; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-NEXT: 
vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm7 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm9 +; AVX2-NEXT: vpermd %ymm9, %ymm2, %ymm9 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm9 +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm10, %xmm3, 
%xmm11 -; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm12 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-NEXT: vpermd %ymm11, %ymm6, %ymm11 -; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm7, (%rsi) -; AVX2-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-NEXT: vmovdqa %ymm6, (%rdx) +; AVX2-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride4_vf32: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm9 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] -; AVX2-FP-NEXT: vpshufb %ymm7, 
%ymm1, %ymm9 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-FP-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-FP-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-FP-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX2-FP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm7 +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX2-FP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm3, %ymm8 +; AVX2-FP-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm1, %ymm8 +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-FP-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-FP-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm5, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX2-FP-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm8 +; AVX2-FP-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm3, %ymm9 +; AVX2-FP-NEXT: vpermd %ymm9, %ymm2, %ymm9 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 +; 
AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm12 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-FP-NEXT: vpermd %ymm11, %ymm6, %ymm11 -; AVX2-FP-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FP-NEXT: vmovdqa %ymm7, (%rsi) -; AVX2-FP-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-FP-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm7 +; AVX2-FP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-FP-NEXT: vmovdqa %ymm6, (%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride4_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; 
AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm9 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm8 +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm8 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = 
xmm8[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm8 +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm9 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm12 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpermd %ymm11, %ymm6, %ymm11 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rsi) -; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rcx) ; 
AVX2-FCP-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq @@ -2696,517 +2642,379 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i8_stride4_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $168, %rsp -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm13, %xmm5, %xmm8 -; AVX2-NEXT: vpshufb %xmm13, %xmm4, %xmm9 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; AVX2-NEXT: vpshufb %ymm13, %ymm2, %ymm9 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4] -; AVX2-NEXT: vpermd %ymm9, %ymm1, %ymm9 -; AVX2-NEXT: vpshufb %ymm13, %ymm0, %ymm10 -; AVX2-NEXT: vpermd %ymm10, %ymm1, %ymm10 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 176(%rdi), %xmm8 -; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm10 -; AVX2-NEXT: vmovdqa 160(%rdi), %xmm9 -; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; AVX2-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm13, %xmm0, %xmm10 -; AVX2-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufb %xmm13, %xmm0, %xmm14 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm11 -; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] -; AVX2-NEXT: vpshufb %ymm13, %ymm11, %ymm14 -; AVX2-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm13 -; AVX2-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm15, %xmm5, %xmm0 -; AVX2-NEXT: vmovdqa %xmm5, %xmm10 -; AVX2-NEXT: vpshufb %xmm15, %xmm4, %xmm2 -; AVX2-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 
= xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm15, %ymm6, %ymm2 -; AVX2-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm15, %ymm5, %ymm13 -; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %xmm8, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm0 -; AVX2-NEXT: vmovdqa %xmm9, %xmm7 -; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm15, %xmm8, %xmm2 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm15, %xmm9, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-NEXT: vpshufb %ymm15, %ymm11, %ymm2 -; AVX2-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-NEXT: vpshufb %xmm3, %xmm14, %xmm15 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX2-NEXT: vpshufb %ymm3, %ymm6, %ymm13 -; AVX2-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-NEXT: vpshufb %ymm3, %ymm5, %ymm15 -; AVX2-NEXT: vpermd %ymm15, %ymm1, %ymm15 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm5 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4] +; AVX2-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX2-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vpshufb %ymm10, %ymm7, %ymm11 +; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-NEXT: vpshufb %ymm10, %ymm6, %ymm12 +; AVX2-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm10, %ymm4, %ymm11 +; AVX2-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-NEXT: vpshufb %ymm10, %ymm2, %ymm12 +; AVX2-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm12 +; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-NEXT: vpermd %ymm10, %ymm3, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vpshufb %ymm12, %ymm9, %ymm11 +; AVX2-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-NEXT: vpshufb %ymm12, %ymm8, %ymm13 +; AVX2-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vpshufb %ymm12, %ymm7, %ymm13 +; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX2-NEXT: vpshufb %ymm12, %ymm6, %ymm14 +; AVX2-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-NEXT: vpshufb %ymm12, %ymm4, %ymm13 +; AVX2-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-NEXT: vpshufb %ymm12, %ymm2, %ymm14 +; AVX2-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-NEXT: vpshufb %ymm12, %ymm1, %ymm14 +; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX2-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm14, %ymm9, %ymm13 +; AVX2-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-NEXT: vpshufb %ymm14, %ymm8, %ymm15 +; AVX2-NEXT: vpermd %ymm15, %ymm3, %ymm15 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-NEXT: vpshufb %ymm3, %ymm11, %ymm2 -; AVX2-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm3, %ymm11, %ymm3 -; AVX2-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm6, %xmm10, %xmm5 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX2-NEXT: vpshufb %ymm6, %ymm11, %ymm5 -; AVX2-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpshufb %ymm14, %ymm7, %ymm15 +; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm5 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; AVX2-NEXT: vpshufb %ymm14, %ymm6, %ymm15 +; AVX2-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-NEXT: vpshufb %ymm14, %ymm4, %ymm5 +; AVX2-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-NEXT: vpshufb %ymm14, %ymm2, %ymm15 +; AVX2-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vpshufb %ymm14, %ymm1, %ymm15 +; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm10 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; AVX2-NEXT: vpshufb %ymm14, %ymm0, %ymm14 +; AVX2-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX2-NEXT: vpermd %ymm9, %ymm3, %ymm9 +; AVX2-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm3, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-NEXT: vpermd %ymm6, %ymm3, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-NEXT: vpshufb %ymm5, 
%ymm2, %ymm2 +; AVX2-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-NEXT: vmovdqa %ymm3, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm12, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm11, (%rdx) +; AVX2-NEXT: vmovdqa %ymm14, 32(%rcx) ; AVX2-NEXT: vmovdqa %ymm13, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-NEXT: addq $168, %rsp +; AVX2-NEXT: vmovdqa %ymm6, (%r8) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: load_i8_stride4_vf64: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: subq $168, %rsp -; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm5, %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm9 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm2, %ymm9 -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4] -; AVX2-FP-NEXT: vpermd %ymm9, %ymm1, %ymm9 -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm10 -; AVX2-FP-NEXT: vpermd %ymm10, %ymm1, %ymm10 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm8 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm10 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm9 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm10 -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm14 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = 
xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm11 -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm11, %ymm14 -; AVX2-FP-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm13 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm5, %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm10 -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm4, %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm6, %ymm2 -; AVX2-FP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm5, %ymm13 -; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm0 -; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm7 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm8, %xmm2 -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm15, %xmm9, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm11, %ymm2 -; AVX2-FP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FP-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm3 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-FP-NEXT: vpshufb %xmm3, 
%xmm14, %xmm15 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm6, %ymm13 -; AVX2-FP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm5, %ymm15 -; AVX2-FP-NEXT: vpermd %ymm15, %ymm1, %ymm15 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm5 +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4] +; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX2-FP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm7, %ymm11 +; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm6, %ymm12 +; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm4, %ymm11 +; AVX2-FP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm2, %ymm12 +; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm12 +; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX2-FP-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-FP-NEXT: vpermd %ymm10, %ymm3, %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm9, %ymm11 +; AVX2-FP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm8, %ymm13 +; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm7, %ymm13 +; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm6, %ymm14 +; AVX2-FP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm4, %ymm13 +; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm14 +; AVX2-FP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FP-NEXT: vpshufb %ymm12, %ymm1, %ymm14 +; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] 
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm9, %ymm13 +; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm15 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm3, %ymm15 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm2 -; AVX2-FP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm3 -; AVX2-FP-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm6 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm5 -; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FP-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm5 -; AVX2-FP-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm7, %ymm15 +; AVX2-FP-NEXT: 
vextracti128 $1, %ymm15, %xmm5 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm6, %ymm15 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm4, %ymm5 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm15 +; AVX2-FP-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm15 +; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm10 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm14 +; AVX2-FP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX2-FP-NEXT: vpermd %ymm9, %ymm3, %ymm9 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FP-NEXT: vpermd %ymm8, %ymm3, %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FP-NEXT: vpermd %ymm6, %ymm3, %ymm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FP-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm12, 32(%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm11, (%rdx) +; AVX2-FP-NEXT: vmovdqa %ymm14, 32(%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm13, (%rcx) ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-FP-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-FP-NEXT: addq $168, %rsp +; AVX2-FP-NEXT: vmovdqa %ymm6, (%r8) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: load_i8_stride4_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $168, %rsp -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm9 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm9 -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,4,0,4,0,4,0,4] -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm10 -; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm10 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm10 -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm14 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm14 -; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm2 -; 
AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm9, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm2 -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm3 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm15 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm13 -; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm13 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm15 -; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm15 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm5 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,4,0,4,0,4,0,4] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm11 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm12 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] +; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm11 +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm12 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm12 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm3, %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm11 +; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm13 +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm13 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm14 +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm13 +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm14 +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm14 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm13 +; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm13 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm15 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm3, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm2 -; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 -; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3 
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm6 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm15 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm5 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm15 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm15 +; AVX2-FCP-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm15 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm10 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm14 +; AVX2-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm3, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm3, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm12, 32(%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm14, 32(%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm13, (%rcx) ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r8) -; AVX2-FCP-NEXT: addq $168, %rsp +; AVX2-FCP-NEXT: vmovdqa %ymm6, (%r8) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index e7bb02db62753..ac14f55e3f0ed 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -6395,203 +6395,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i8_stride5_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm24 -; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm22 +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm25 +; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm23 ; AVX512-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm7 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6 -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX512-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) -; AVX512-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) +; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] +; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) +; AVX512-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] +; AVX512-NEXT: vpermd %ymm12, %ymm17, %ymm15 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13 -; AVX512-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX512-NEXT: vpshufb %xmm10, %xmm13, %xmm10 -; AVX512-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512-NEXT: vinserti128 
$1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12)) -; AVX512-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX512-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15)) -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 +; AVX512-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero +; AVX512-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 ; AVX512-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 +; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1)) -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & 
(zmm0 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 ; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 +; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ 
zmm3)) +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 
^ (ymm4 & (ymm12 ^ ymm15)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm3 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] -; AVX512-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512-NEXT: vpermd %ymm4, %ymm17, %ymm4 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-NEXT: vzeroupper @@ -6600,203 +6600,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-LABEL: load_i8_stride5_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 
= [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm24 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm22 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) -; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] +; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512-FCP-NEXT: 
vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) +; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13 -; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm10 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12)) -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 +; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; 
AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm4 & (ymm12 ^ ymm15)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-FCP-NEXT: vzeroupper @@ -6805,203 +6805,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-LABEL: load_i8_stride5_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm24 -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm22 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm25 +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm23 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512DQ-NEXT: 
vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm7 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) -; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] +; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) +; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] +; AVX512DQ-NEXT: vpermd %ymm12, %ymm17, %ymm15 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13 -; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm10 -; AVX512DQ-NEXT: 
vmovdqa 128(%rdi), %xmm14 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12)) -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 +; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25)) -; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-NEXT: vpshufb 
{{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 
& ymm20) | ymm2 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-NEXT: 
vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm4 & (ymm12 ^ ymm15)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero +; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] -; AVX512DQ-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vpermd %ymm4, %ymm17, %ymm4 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 
& (zmm4 ^ zmm1)) ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-NEXT: vzeroupper @@ -7010,203 +7010,203 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-LABEL: load_i8_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm24 ^ (ymm7 & (ymm23 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm19) | ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm25 ^ (ymm9 & (ymm7 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (mem & (ymm9 ^ ymm8)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) +; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ~ymm16) | ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm20 & (zmm10 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm12 ^ ymm15)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm7 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm8 ^ (mem & (ymm2 ^ ymm8)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = 
[18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm16 & (ymm3 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm23 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm19) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm15 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ 
ymm7)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm12 ^ ymm15)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] ; AVX512DQ-FCP-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm25 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (mem & (ymm1 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm22 ^ ymm21)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm24 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm15 ^ (ymm4 & (ymm12 ^ ymm15)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: 
vextracti128 $1, %ymm12, %xmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm7 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (mem & (ymm10 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm24 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm22 ^ (ymm4 & (ymm21 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper @@ -7231,163 +7231,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm10 +; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm9 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] ; AVX512BW-NEXT: movl $4228, %eax # imm = 0x1084 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] +; AVX512BW-NEXT: vpermd %ymm8, %ymm19, %ymm8 ; AVX512BW-NEXT: movl $127, %eax ; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} -; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11] -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11 -; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX512BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18 +; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} +; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512BW-NEXT: vpshufb {{.*#+}} 
xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10 +; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1] +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6} -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u] -; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] +; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512BW-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = 
xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11 -; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm11 -; AVX512BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero -; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm13 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 +; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15 +; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero +; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 ; AVX512BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000 ; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1] ; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3} -; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u] -; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6} +; AVX512BW-NEXT: vpblendmw 
%ymm3, %ymm2, %ymm16 {%k3} +; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u] +; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1] ; AVX512BW-NEXT: movl $16912, %eax # imm = 0x4210 ; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm10 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512BW-NEXT: vporq %xmm14, %xmm16, %xmm14 -; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm15, %ymm10 -; AVX512BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero -; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13] -; AVX512BW-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14 -; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm4 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512BW-NEXT: vporq %xmm15, %xmm17, %xmm15 +; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4 +; AVX512BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero +; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15 
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13] +; AVX512BW-NEXT: vporq %xmm17, %xmm15, %xmm15 +; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1] ; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u] -; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5} +; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u] +; AVX512BW-NEXT: vporq %xmm17, %xmm16, %xmm16 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1] ; AVX512BW-NEXT: movl $33825, %eax # imm = 0x8421 ; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm11 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm9 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 ; AVX512BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF ; AVX512BW-NEXT: kmovq %rax, %k5 -; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11 -; AVX512BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3} -; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 +; AVX512BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3} +; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2} ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u] @@ -7400,36 +7401,35 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX512BW-NEXT: movl $2114, %eax # imm = 0x842 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; 
AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5] -; AVX512BW-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpermd %ymm2, %ymm19, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512BW-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero -; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512BW-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -7453,163 +7453,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm10 +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm9 ; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 +; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5] +; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8 ; AVX512BW-FCP-NEXT: movl $127, %eax ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11] -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11 -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11] +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20 ; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1] +; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] ; AVX512BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm11 -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; 
AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5}
+; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15
+; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
 ; AVX512BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000
 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
-; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1]
 ; AVX512BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000
 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1]
 ; AVX512BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210
 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm10
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
-; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm10
-; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
-; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5}
+; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4
+; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15
+; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1]
 ; AVX512BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000
 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u]
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
 ; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000
 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1]
 ; AVX512BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421
 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm11
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4
 ; AVX512BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
 ; AVX512BW-FCP-NEXT: kmovq %rax, %k5
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
-; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5}
+; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9
+; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
@@ -7622,36 +7623,35 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1]
 ; AVX512BW-FCP-NEXT: movl $2114, %eax # imm = 0x842
 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
-; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2
 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
 ; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
 ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
@@ -7675,163 +7675,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm10
+; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm9
 ; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm4
-; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm5
+; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
 ; AVX512DQ-BW-NEXT: movl $4228, %eax # imm = 0x1084
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm6
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm7
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-BW-NEXT: vpermd %ymm8, %ymm19, %ymm8
 ; AVX512DQ-BW-NEXT: movl $127, %eax
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k4
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
-; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm12
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11]
-; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5}
-; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm9
-; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm8
-; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4}
+; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm11
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
+; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm12
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm10, %xmm8
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5}
+; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm9, %ymm10
+; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm7
+; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20
 ; AVX512DQ-BW-NEXT: movw $10570, %ax # imm = 0x294A
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
-; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
 ; AVX512DQ-BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm14
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
 ; AVX512DQ-BW-NEXT: movl $8456, %eax # imm = 0x2108
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
-; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5}
-; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm14, %ymm11
-; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1}
-; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm10
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm13
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5}
+; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm14, %ymm15
+; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1}
+; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
 ; AVX512DQ-BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k4
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
-; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14
+; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1]
 ; AVX512DQ-BW-NEXT: movl $138543104, %eax # imm = 0x8420000
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
-; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3}
+; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1]
 ; AVX512DQ-BW-NEXT: movl $16912, %eax # imm = 0x4210
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm10
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
-; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm16, %xmm14
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
-; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm15, %ymm10
-; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
-; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm4
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm17, %xmm15
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5}
+; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm16, %ymm4
+; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13]
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm15, %xmm15
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4}
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15
+; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1]
 ; AVX512DQ-BW-NEXT: movl $277086208, %eax # imm = 0x10840000
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u]
+; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm16, %xmm16
 ; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1]
 ; AVX512DQ-BW-NEXT: movl $33825, %eax # imm = 0x8421
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm11
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm18, %xmm9
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4
 ; AVX512DQ-BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
 ; AVX512DQ-BW-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
-; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm12, %xmm12
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5}
+; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9
+; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
@@ -7844,36 +7845,35 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1]
 ; AVX512DQ-BW-NEXT: movl $2114, %eax # imm = 0x842
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
-; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm19, %ymm2
 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
 ; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm3
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
 ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%rsi)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r8)
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r9)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
@@ -7897,163 +7897,164 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm9
 ; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
 ; AVX512DQ-BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm4 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm19 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm19, %ymm8
 ; AVX512DQ-BW-FCP-NEXT: movl $127, %eax
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20
 ; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
 ; AVX512DQ-BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
 ; AVX512DQ-BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,zero,zero,xmm17[4,9,14],zero,zero,zero,xmm17[2,7,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,0,5,10,15],zero,zero,zero,xmm16[3,8,13],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
 ; AVX512DQ-BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm15 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm15[2,3,0,1]
 ; AVX512DQ-BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[1,6,11],zero,zero,zero,zero,xmm17[4,9,14,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15],zero,zero,zero,xmm16[u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm15 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm15[2,3,0,1]
 ; AVX512DQ-BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm16 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm16, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm8, %ymm15 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm15[u,u,u,1,6,11],zero,zero,zero,zero,xmm15[4,9,14],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[3,8,13]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm4 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm4[2,3,0,1]
 ; AVX512DQ-BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm4 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[3,8,13],zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[2,7,12],zero,zero,zero,xmm16[0,5,10,15,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
 ; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm4 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,3,0,1]
 ; AVX512DQ-BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4
 ; AVX512DQ-BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
@@ -8066,36 +8067,35 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm5[2,3,0,1]
 ; AVX512DQ-BW-FCP-NEXT: movl $2114, %eax # imm = 0x842
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
 ; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r8)
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index 9ce685f13e476..f87126a98eea4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -7354,12 +7354,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-LABEL: load_i8_stride6_vf64:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: subq $40, %rsp
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm25
 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26
 ; AVX512-NEXT: vmovdqa %ymm12, %ymm0
 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4
@@ -7608,12 +7608,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-LABEL: load_i8_stride6_vf64:
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: subq $40, %rsp
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25
 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26
 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm0
 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3
 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
@@ -7862,12 +7862,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-LABEL: load_i8_stride6_vf64:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: subq $40, %rsp
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm25
 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26
 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0
 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm3
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4
@@ -8116,12 +8116,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-FCP-LABEL: load_i8_stride6_vf64:
 ; AVX512DQ-FCP: # %bb.0:
 ; AVX512DQ-FCP-NEXT: subq $40, %rsp
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm25
 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm0
 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm25 ^ (ymm0 & (ymm26 ^ ymm25))
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
@@ -8370,12 +8370,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-LABEL: load_i8_stride6_vf64:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm0
 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm23
 ; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924
 ; AVX512BW-NEXT: kmovd %r10d, %k1
 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm9, %xmm1
 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm12
@@ -8606,12 +8606,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-LABEL: load_i8_stride6_vf64:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
 ; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924
 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1
 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1
 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
@@ -8842,12 +8842,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-LABEL: load_i8_stride6_vf64:
 ; AVX512DQ-BW: # %bb.0:
 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm0
 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm23
 ; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924
 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1
 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm9, %xmm1
 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm12
@@ -9078,12 +9078,12 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride6_vf64:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
 ; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924
 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1
 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index bea6219b9fbac..5ab09194c5b83 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -12121,414 +12121,399 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm19
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm10
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm10 ^ (ymm1 & (ymm19 ^ ymm10))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm27 ^ ymm30))
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3,4],ymm9[5],ymm2[6,7,8,9],ymm9[10],ymm2[11,12],ymm9[13],ymm2[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm1 & mem)
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm31
-; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm29
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm31 ^ (ymm1 & (ymm29 ^ ymm31))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm20
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm20 ^ ymm12))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm1
+;
AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm27 ^ (ymm1 & (ymm31 ^ ymm27)) +; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28 +; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30 +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm30 ^ ymm28)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6] -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm2 & (zmm20 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %ymm26 -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm11 ^ (ymm4 & (ymm26 ^ ymm11)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm13 -; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm16 -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm15 ^ (ymm5 & (ymm16 ^ ymm15)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4,5],ymm14[6],ymm5[7,8,9],ymm14[10],ymm5[11,12,13],ymm14[14],ymm5[15] -; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm13 & ymm21) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm10 ^ (ymm13 & (ymm19 ^ ymm10)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3 -; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm30 ^ ymm27)) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7,8,9],ymm9[10],ymm13[11,12,13],ymm9[14],ymm13[15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm21 & (zmm8 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqa64 288(%rdi), %ymm16 +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm11 ^ ymm16)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm14 ^ (ymm7 & (ymm2 ^ ymm14)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm7[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7,8,9],ymm13[10],ymm7[11,12,13],ymm13[14],ymm7[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm0 & ymm26) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512-FCP-NEXT: vmovdqa64 416(%rdi), %ymm17 +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %ymm18 +; 
AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm7[4,11],zero,zero +; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm23 & (ymm15 ^ ymm13)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm8)) +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm20 ^ ymm12)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm27 ^ ymm31)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm6[2],ymm13[3,4,5],ymm6[6],ymm13[7,8,9],ymm6[10],ymm13[11,12,13],ymm6[14],ymm13[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm3 & ~mem) -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm31 ^ (ymm3 & (ymm29 ^ ymm31)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6] -; AVX512-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm23 -; AVX512-FCP-NEXT: vmovdqa %xmm0, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ~mem) +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm30 ^ ymm28)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u] 
+; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6] +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm15[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm21 & (zmm7 ^ zmm13)) +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm11 ^ (ymm8 & (ymm16 ^ ymm11)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u] +; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26) +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10] +; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm23 & (ymm8 ^ ymm13)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7)) +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm20 ^ ymm12)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm22 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm2 & (zmm22 ^ zmm13)) -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm19 ^ ymm10)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512-FCP-NEXT: 
vpternlogq {{.*#+}} ymm3 = ymm27 ^ (ymm3 & (ymm30 ^ ymm27)) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17) -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm29 ^ (ymm2 & (ymm31 ^ ymm29)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,5,6] -; AVX512-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,255,18446744073709486080,18446744073709551615] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm25) +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6] +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm13, %ymm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = 
[0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm22 & (zmm3 ^ zmm8)) +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm11 ^ (ymm7 & (ymm16 ^ ymm11)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm23 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm18 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm3 ^ (zmm18 & (zmm23 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm19 ^ ymm10)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm27 ^ (ymm3 & (ymm30 ^ ymm27)) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2],ymm9[3],ymm3[4,5,6],ymm9[7,8],ymm3[9,10],ymm9[11],ymm3[12,13,14],ymm9[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17) -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm29 ^ (ymm2 & (ymm31 ^ ymm29)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm8 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512-FCP-NEXT: vpor %xmm8, %xmm13, %xmm8 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm7)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm14 ^ ymm2)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm13[0],ymm8[1,2],ymm13[3],ymm8[4,5,6],ymm13[7,8],ymm8[9,10],ymm13[11],ymm8[12,13,14],ymm13[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm26) +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero +; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 416(%rdi), %ymm24 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %ymm25 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm3 ^ (zmm18 & (zmm28 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm5)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm20)) -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm11 ^ ymm26)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm15 ^ (ymm7 & (ymm16 ^ ymm15)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm21) -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ 
ymm7)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm5 & (zmm20 ^ zmm22)) -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm11 ^ ymm26)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm16 ^ (ymm7 & (ymm15 ^ ymm16)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3],ymm7[4,5,6],ymm8[7,8],ymm7[9,10],ymm8[11],ymm7[12,13,14],ymm8[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm21) -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm7)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm5 & (zmm22 ^ zmm23)) -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm15 ^ ymm16)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3],ymm7[4],ymm3[5,6],ymm7[7,8],ymm3[9,10,11],ymm7[12],ymm3[13,14],ymm7[15] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm26 ^ (ymm3 & (ymm11 ^ ymm26)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm23 & (ymm7 ^ ymm8)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm21 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm29 & (zmm21 ^ zmm3)) +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm20 ^ ymm12)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm21) | ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero -; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm0 & (ymm2 ^ ymm3)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm5 & (zmm23 ^ zmm28)) -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm15 ^ ymm16)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5,6],ymm6[7,8],ymm7[9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm25) ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm26 ^ ymm11)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm21) | ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm0 & (ymm4 ^ ymm3)) -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm16 ^ ymm15)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm26 ^ ymm11)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm8 +; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm3)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm22 & (zmm0 ^ zmm7)) +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6],ymm4[7,8],ymm3[9,10,11],ymm4[12],ymm3[13,14],ymm4[15] +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm11 ^ (ymm4 & (ymm16 ^ ymm11)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u] +; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3 +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero ; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm4)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm14 ^ ymm2)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm26) | ymm0 +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero +; AVX512-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm29 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm23 & (ymm29 ^ ymm3)) ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm28 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm28 = ymm28 ^ (ymm0 & (ymm28 ^ ymm3)) -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm10 ^ ymm19)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = ymm24 ^ (ymm21 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19)) +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm2 ^ ymm14)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16)) ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] ; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm19 ^ (ymm6 & (ymm10 ^ ymm19)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm27 ^ (ymm7 & (ymm30 ^ 
ymm27)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm11 ^ (ymm12 & (ymm26 ^ ymm11)) -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm30 ^ (ymm6 & (ymm27 ^ ymm30)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm27 ^ ymm30)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm0 +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm26 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm23 & (ymm26 ^ ymm3)) +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm20 ^ (ymm13 & (ymm12 ^ ymm20)) +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17)) ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3],ymm9[4],ymm7[5,6],ymm9[7,8],ymm7[9,10,11],ymm9[12],ymm7[13,14],ymm9[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm2 & ymm17) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5,6,7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13,14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm3 & ymm17) -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & ymm17) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm29 ^ (ymm10 & (ymm31 ^ ymm29)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm12 ^ ymm20)) +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm20 ^ (ymm10 & (ymm12 ^ ymm20)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX512-FCP-NEXT: vpternlogq 
{{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm0, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm9 & (ymm11 ^ ymm16)) +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm31 ^ ymm27)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[4,11],zero,zero,xmm10[0,7,14,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm10, %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6],ymm6[7,8],ymm7[9,10,11],ymm6[12],ymm7[13,14],ymm6[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm3 & ymm25) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13,14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm0 & ymm25) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7,8],ymm6[9],ymm9[10,11,12],ymm6[13],ymm9[14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm25) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm1 & (ymm2 ^ ymm14)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm28 ^ ymm30)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm11 & (ymm3 ^ ymm2)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = ymm15 ^ (ymm14 & (ymm16 ^ ymm15)) -; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm29 ^ (ymm13 & (ymm31 ^ ymm29)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm29 ^ ymm31)) -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm2 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm9)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm9 & (ymm3 ^ ymm0)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm9 & (ymm4 ^ ymm0)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm11 & (ymm5 ^ ymm9)) -; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm13 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm13 & (ymm11 ^ ymm9)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm9 & (ymm5 ^ ymm1)) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,2,4,6,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm10)) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,4,6,0,0,0,0] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm3 & (zmm4 ^ zmm7)) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,6,0,0,0,0] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm6)) -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm3 & (zmm6 ^ zmm7)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm3 ; 
AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm16[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u] -; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm26, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & mem) | ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-FCP-NEXT: 
movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -12961,413 +12946,405 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm26 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm11 ^ (ymm1 & (ymm26 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: pushq %rax +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm11 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm29 ^ ymm30)) -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3,4],ymm9[5],ymm2[6,7,8,9],ymm9[10],ymm2[11,12],ymm9[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm1 & mem) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm24 ^ (ymm1 & (ymm31 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm27 ^ (ymm2 & (ymm31 ^ ymm27)) +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6] -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm2 & (zmm20 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %ymm19 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm10 ^ (ymm4 & (ymm19 ^ ymm10)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm13 -; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm15 ^ (ymm5 & (ymm16 ^ ymm15)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4,5],ymm14[6],ymm5[7,8,9],ymm14[10],ymm5[11,12,13],ymm14[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm13 & ymm23) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm26 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm29 ^ (ymm13 & (ymm30 ^ ymm29)) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7,8,9],ymm9[10],ymm13[11,12,13],ymm9[14],ymm13[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm3 & ~mem) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm31 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm22 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm21 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm26 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm29 ^ (ymm3 & (ymm30 ^ ymm29)) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & 
ymm17) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm24 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,5,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm22 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm18 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm3 ^ (zmm18 & (zmm22 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa64 288(%rdi), %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm26 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm1 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm29 
^ (ymm3 & (ymm30 ^ ymm29)) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2],ymm9[3],ymm3[4,5,6],ymm9[7,8],ymm3[9,10],ymm9[11],ymm3[12,13,14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm24 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm13, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4,5],ymm8[6],ymm13[7,8,9],ymm8[10],ymm13[11,12,13],ymm8[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm10 & ymm26) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 416(%rdi), %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm17 ^ (ymm13 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm21 & (ymm13 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxwd 
{{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm29 & (zmm3 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7,8,9],ymm3[10],ymm8[11,12,13],ymm3[14],ymm8[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ~mem) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm28 ^ (ymm7 & (ymm30 ^ ymm28)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,4,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm13, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm16 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm8 +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10] +; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (mem & (ymm8 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm21 & (ymm8 ^ ymm13)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8,9,10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm23 = [18446744073709551615,255,18446744073709486080,18446744073709551615] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm23) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,5,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm8, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm13 ^ (zmm22 & (zmm3 ^ zmm13)) +; 
AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm16 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 416(%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm3 ^ (zmm18 & (zmm28 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm27 & (ymm3 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm20)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm14 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5,6],ymm15[7,8],ymm13[9,10],ymm15[11],ymm13[12,13,14],ymm15[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm26) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm15 ^ (ymm7 & (ymm16 ^ ymm15)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm23) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero 
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm21 & (ymm7 ^ ymm13)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm29 & (zmm20 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm11 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm27 & (ymm3 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm5 & (zmm20 ^ zmm21)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm16 ^ (ymm7 & (ymm15 ^ ymm16)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3],ymm7[4,5,6],ymm8[7,8],ymm7[9,10],ymm8[11],ymm7[12,13,14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5,6],ymm0[7,8],ymm7[9,10],ymm0[11],ymm7[12,13,14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm23) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13 +; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm13, %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm8 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm27 & (ymm3 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm21 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm5 & (zmm21 ^ zmm22)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm15 ^ ymm16)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3],ymm7[4],ymm3[5,6],ymm7[7,8],ymm3[9,10,11],ymm7[12],ymm3[13,14],ymm7[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm10 ^ ymm19)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm23) | ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm25 & (ymm3 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm22 & (zmm3 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm14 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm16 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm26) | ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm18 ^ (ymm4 & (ymm17 ^ ymm18)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm27 & (ymm2 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm5 & (zmm22 ^ zmm28)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm15 ^ ymm16)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm19 ^ ymm10)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm21 & (ymm4 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm23) | ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512DQ-FCP-NEXT: 
vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm27 & (ymm1 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm16 ^ ymm15)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm19 ^ ymm10)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18)) ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm26 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm21 & (ymm26 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm14 ^ (ymm3 & (ymm2 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm28 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm28 = ymm28 ^ (ymm27 & (ymm28 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm26 ^ (ymm2 & (ymm11 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vporq %xmm3, %xmm2, %xmm27 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm23 = ymm24 ^ (ymm23 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm26 ^ (ymm2 & (ymm11 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, 
%ymm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (ymm7 & ~mem) | ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm11 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm29 ^ (ymm3 & (ymm30 ^ ymm29)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm10 ^ (ymm12 & (ymm19 ^ ymm10)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm29 ^ ymm30)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm29 ^ ymm30)) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2,3],ymm9[4],ymm3[5,6],ymm9[7,8],ymm3[9,10,11],ymm9[12],ymm3[13,14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm27 & ymm17) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm2 & ymm17) -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm6 & ymm17) -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm0 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ 
ymm18)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm29 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm21 & (ymm29 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm12 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm12 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm11 ^ (ymm6 & (ymm12 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm16 ^ (ymm9 & (ymm1 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm27 ^ (ymm15 & (ymm31 ^ ymm27)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vporq %xmm11, %xmm0, %xmm16 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm8[1,2,3],ymm4[4],ymm8[5,6],ymm4[7,8],ymm8[9,10,11],ymm4[12],ymm8[13,14],ymm4[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ymm23) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm3 & ymm23) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7,8],ymm4[9],ymm9[10,11,12],ymm4[13],ymm9[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm16 & ymm23) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm10 & (ymm2 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm28 ^ (ymm10 & (ymm30 ^ ymm28)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm9, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm11 & (ymm8 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = ymm15 ^ (ymm14 & (ymm16 ^ ymm15)) -; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm6 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm0 ^ ymm31)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm0 ^ (ymm14 & (ymm31 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm8 & (zmm2 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm25 & 
(ymm9 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm7, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm11 & (ymm5 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm13 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm25 & (ymm7 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm13 & (ymm11 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm8 & (zmm5 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm25 & (ymm10 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,2,4,6,0,0,0,0] +; 
AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm11)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,4,6,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm11, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,5,6,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm10, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm9 & (zmm8 ^ zmm0)) ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm28, %zmm0, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm16[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u] -; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm26, %zmm0, %zmm3 {%k1} +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm29, %zmm0, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm23, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, 
%xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-FCP-NEXT: popq %rax ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -13743,29 +13720,29 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i8_stride7_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm16 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm17 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm18 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm4 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm4 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1} ; AVX512BW-FCP-NEXT: kmovq %k1, %k2 ; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 @@ 
-13776,11 +13753,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX512BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm5 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm5 {%k1} ; AVX512BW-FCP-NEXT: kmovq %k1, %k3 ; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -13789,285 +13766,285 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6] -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm6, %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512BW-FCP-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %xmm8 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm5 ; AVX512BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512BW-FCP-NEXT: kmovq %rax, %k5 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm1 {%k5} -; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 ; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX512BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm9 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm9[u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm21 +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; 
AVX512BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512BW-FCP-NEXT: kmovd %eax, %k7 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm21 {%k7} -; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX512BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm4, %ymm20 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm22 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm18 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 ; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23 +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm10 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,4,6] -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, 
%ymm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm10 {%k5} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5} +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm15 +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX512BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,5,6] -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm19, %xmm14 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = 
xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 -; AVX512BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000 -; AVX512BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 
4-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 ; AVX512BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm0 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k4} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm4, %ymm18 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] -; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm0 {%k1} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = 
ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm4, %ymm17 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm0 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm9, %ymm16 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k1} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero 
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm15 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm16 {%k2} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm0 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm14 {%k2} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = 
ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm16 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm9, %ymm0 {%k1} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm9, %ymm0 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} -; AVX512BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm0 {%k4} -; AVX512BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm21 {%k6} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = 
xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3} +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[1,8,15],zero,zero,xmm21[4,11,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm13, %xmm21, %xmm13 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] -; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm22, %xmm21 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm13 {%k3} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm11, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm11, %xmm11 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm19 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm19, %ymm11 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = 
[16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm19, %zmm19 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k3} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k3} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm21 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm21, %zmm21 -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm0, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm21[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm12, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm2 {%k5} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm12, %xmm18, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm18 = 
xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm13, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k5} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 +; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k5} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm5 {%k1} +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k1} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u] ; AVX512BW-FCP-NEXT: 
vextracti128 $1, %ymm5, %xmm5 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u] @@ -14075,13 +14052,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm4 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm4 {%k2} ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero @@ -14094,11 +14071,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rdi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdi) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -14472,29 +14449,29 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512DQ-BW-FCP-NEXT: 
vpermw %zmm2, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k2 ; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 @@ -14505,11 +14482,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX512DQ-BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k3 ; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 @@ -14518,315 +14495,315 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm6, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX512DQ-BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm9 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm9[u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm7 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm21 {%k7} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm5, %ymm20 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm18 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 ; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} -; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,4,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm10 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm20 = [0,0,0,0,1,3,5,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm19, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, 
%xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 -; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2 +; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm0 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb 
{{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm5, %ymm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm5, %ymm17 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm0 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm9, %ymm16 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm15 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; 
AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm0 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm16 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm9, %ymm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm9, %ymm0 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, 
%ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm21 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm0 {%k6} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512DQ-BW-FCP-NEXT: kmovd 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm13, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm19, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm13, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm19, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw 
{{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm21, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm8, %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm12, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm2 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm12, %xmm18, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm18 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm12 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0] +; 
AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm6, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm15, %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rdi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <448 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 5b607748c5761..99932c0026b23 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -1364,90 +1364,55 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm8 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm9 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 +; 
AVX2-FCP-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm14 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm6 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm7 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; 
AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi) -; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx) -; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx) -; AVX2-FCP-NEXT: vmovq %xmm7, (%r8) -; AVX2-FCP-NEXT: vmovq %xmm8, (%r9) -; AVX2-FCP-NEXT: vmovq %xmm9, (%r11) -; AVX2-FCP-NEXT: vmovq %xmm10, (%r10) -; AVX2-FCP-NEXT: vmovq %xmm0, (%rax) +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FCP-NEXT: vmovq %xmm8, (%rdx) +; AVX2-FCP-NEXT: vmovq %xmm11, (%rcx) +; AVX2-FCP-NEXT: vmovq %xmm3, (%r8) +; AVX2-FCP-NEXT: vmovq %xmm4, (%r9) +; AVX2-FCP-NEXT: vmovq %xmm5, (%r11) +; AVX2-FCP-NEXT: vmovq %xmm6, (%r10) +; AVX2-FCP-NEXT: vmovq %xmm1, (%rax) +; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride8_vf8: @@ -2663,182 +2628,97 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FCP-LABEL: load_i8_stride8_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm8 -; 
AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm6 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm6 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm3 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1,2],xmm3[3] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] 
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm11 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm12 -; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm9[2,3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm6[3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm1 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm5 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FCP-NEXT: 
vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm13 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm15 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm14 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm15 = 
[6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm15 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3] +; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsi) -; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vmovaps %xmm2, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rdx) ; AVX2-FCP-NEXT: vmovdqa %xmm10, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %xmm11, (%r8) -; AVX2-FCP-NEXT: vmovdqa %xmm12, (%r9) +; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-FCP-NEXT: vmovdqa %xmm5, (%r9) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %xmm13, (%rax) +; AVX2-FCP-NEXT: vmovdqa %xmm12, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX2-FCP-NEXT: vmovdqa %xmm13, (%rax) ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rax) +; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: load_i8_stride8_vf16: @@ -2962,114 +2842,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512-FCP-NEXT: vpshufb 
%xmm2, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX512-FCP-NEXT: vpmovqd %ymm9, %xmm8 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512-FCP-NEXT: vpmovqd %ymm10, %xmm11 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm1 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512-FCP-NEXT: vpsrlq $8, %zmm12, %zmm2 +; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX512-FCP-NEXT: vpsrlq $16, %zmm12, %zmm14 +; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] +; AVX512-FCP-NEXT: vpsrlq $24, %zmm12, %zmm11 +; AVX512-FCP-NEXT: vpmovqb %zmm11, %xmm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7] +; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX512-FCP-NEXT: vpsrlq $32, %zmm12, %zmm4 +; AVX512-FCP-NEXT: vpmovqb %zmm4, %xmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; 
AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512-FCP-NEXT: vpsrlq $40, %zmm12, %zmm5 +; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512-FCP-NEXT: vpsrlq $48, %zmm12, %zmm6 +; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7 +; AVX512-FCP-NEXT: vpsrlq $56, %zmm12, %zmm7 ; AVX512-FCP-NEXT: vpmovqb %zmm7, %xmm7 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8 -; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9 -; AVX512-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = 
xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX512-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11 -; AVX512-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12 -; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1 -; AVX512-FCP-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rcx) +; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rdx) +; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rcx) ; AVX512-FCP-NEXT: vmovdqa %xmm8, (%r8) -; AVX512-FCP-NEXT: vmovdqa %xmm9, (%r9) -; AVX512-FCP-NEXT: vmovdqa %xmm10, (%r11) -; AVX512-FCP-NEXT: vmovdqa %xmm11, (%r10) -; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX512-FCP-NEXT: vmovdqa %xmm3, (%r9) +; AVX512-FCP-NEXT: vmovdqa %xmm4, (%r11) +; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r10) +; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -3194,114 +3037,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vpmovqd %ymm9, %xmm8 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vpmovqd %ymm10, %xmm11 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm1 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm12, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm12, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] +; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm12, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm11, %xmm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX512DQ-FCP-NEXT: vpsrlq $32, 
%zmm12, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm12, %zmm6 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7 +; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm12, %zmm7 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm7, %xmm7 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = 
xmm9[0,1],xmm8[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa 
%xmm7, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, (%r11) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, (%r10) -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%r11) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r10) +; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -3426,114 +3232,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vpmovqb %zmm5, %xmm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vpmovqd %ymm4, %xmm3 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0 +; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vpmovqd %ymm6, %xmm7 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm2 +; AVX512BW-FCP-NEXT: 
vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm9, %zmm2 +; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm9, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm9, %zmm7 ; AVX512BW-FCP-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7] +; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm4 +; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm9, %zmm7 +; AVX512BW-FCP-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm7 +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8 +; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm9, %zmm8 ; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9 -; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; 
AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm8 +; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] +; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm9, %zmm10 ; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11 -; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12 -; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm1[0,1,2],xmm0[3] -; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %xmm10, (%r11) -; AVX512BW-FCP-NEXT: vmovdqa %xmm11, (%r10) -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm9, %zmm6 +; AVX512BW-FCP-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%r11) +; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r10) +; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -3658,114 +3427,77 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm5, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm5, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm6, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm9, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm9, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm9, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm9, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = 
xmm8[0,1,2],xmm7[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm5, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm9, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm5, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm5, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm9, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm5, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = 
[6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm5, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm10, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm11, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm9, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%r11) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r10) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <128 x i8>, ptr %in.vec, align 64 @@ -6063,305 +5795,180 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FCP-LABEL: load_i8_stride8_vf32: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $248, %rsp -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm1 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-FCP-NEXT: 
vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm9 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-FCP-NEXT: subq $136, %rsp +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm11 -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm14 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vmovdqa %xmm10, %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm7 -; 
AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
-; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm9
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm5
+; AVX2-FCP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm10
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm14
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm8
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm6
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm11
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm7
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm12
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm15
+; AVX2-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm5
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm5
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm8
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm15, %xmm10
-; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm13, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0
-; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm9
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm6, %xmm8
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1
-; AVX2-FCP-NEXT: vmovdqa %xmm11, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm6
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,2,3,1,3,5,7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa %xmm5, %xmm0
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm2
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2
+; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm0, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm0
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5],ymm9[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm9
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm13[3]
+; AVX2-FCP-NEXT: vpermd %ymm15, %ymm2, %ymm13
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm2, %ymm15
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm14[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%r9)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8)
+; AVX2-FCP-NEXT: vmovdqa %ymm12, (%r9)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
-; AVX2-FCP-NEXT: addq $248, %rsp
+; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rax)
+; AVX2-FCP-NEXT: addq $136, %rsp
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
@@ -6721,231 +6328,186 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-FCP-LABEL: load_i8_stride8_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm18
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm20
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
-; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13
-; AVX512-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
-; AVX512-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512-FCP-NEXT: vpmovqb %zmm18, %xmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm27
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm18, %zmm8
-; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm31
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm18, %zmm8
-; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18
+; AVX512-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm6
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
+; AVX512-FCP-NEXT: vpmovqd %ymm12, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm19
+; AVX512-FCP-NEXT: vpmovqd %ymm19, %xmm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512-FCP-NEXT: vpmovqb %zmm20, %xmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm4
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm4
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm6
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm18, %zmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm20, %zmm6
; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm4
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm14
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm15
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3]
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm18, %zmm14
-; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm4
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm14
-; AVX512-FCP-NEXT: vmovdqa %xmm3, %xmm7
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm18, %zmm4
-; AVX512-FCP-NEXT: vpmovqb %zmm4, %xmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm18, %zmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm31
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm30
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm28
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm5
+; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm27
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm6
+; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm26
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm20, %zmm6
+; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm25
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm24
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm1
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512-FCP-NEXT: vpsrlq $24, %zmm20, %zmm2
; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm9
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm14
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm14
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm20, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
+; AVX512-FCP-NEXT: vpsrlq $40, %zmm20, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm13
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm13
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13
+; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm15
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm20, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm13
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm13
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm14, %xmm5
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
+; AVX512-FCP-NEXT: vpsrlq $56, %zmm20, %zmm3
; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %ymm15, (%r9)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %ymm12, (%r9)
+; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
-; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -7305,231 +6867,186 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-FCP-LABEL: load_i8_stride8_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm18
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm20
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
-; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm18, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm27
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm18, %zmm8
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm20
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm31
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm18, %zmm8
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18
+; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm12, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm19
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm19, %xmm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm20, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm6
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm18, %zmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm20, %zmm6
; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm4
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm14
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm15
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm18, %zmm14
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm18, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm31
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm30
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm28
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm27
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm26
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm20, %zmm6
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm25
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm24
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm1
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm20, %zmm2
; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 =
ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm14 +; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm20, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm20, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm20, %zmm15 +; 
AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] +; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm20, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, (%r9) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, (%r9) +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax) -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -7837,214 +7354,169 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i8_stride8_vf32: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm2 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm26 -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm26, %ymm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm27 -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm27, %ymm3 -; 
AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm30 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm28 -; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm28, %ymm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm5 -; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm16 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm3, %xmm17 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm5, %xmm16 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm16[0],xmm17[0],xmm16[1],xmm17[1],xmm16[2],xmm17[2],xmm16[3],xmm17[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512BW-FCP-NEXT: vpmovqb %zmm0, %xmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512BW-FCP-NEXT: vpmovqd %ymm12, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm16 +; AVX512BW-FCP-NEXT: vpmovqd %ymm16, %xmm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX512BW-FCP-NEXT: vpmovqb %zmm4, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm16 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm4, %ymm6 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 
= [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm26, %ymm7 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm27, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm3, %ymm9 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm28, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm20[0],xmm8[1],xmm20[1],xmm8[2],xmm20[2],xmm8[3],xmm20[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm7 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm26, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm2, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm9 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm0, %xmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = 
[2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm27, %ymm8 +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm7, %ymm6 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm28, %ymm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm24 -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm24[0],xmm10[1],xmm24[1],xmm10[2],xmm24[2],xmm10[3],xmm24[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] -; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm4, %ymm4 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm26, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm26 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm27, %ymm8 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm28, %ymm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm28 -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm28[0],xmm10[1],xmm28[1],xmm10[2],xmm28[2],xmm10[3],xmm28[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] -; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm10 -; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7] -; AVX512BW-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm10 -; 
AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm11 -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm8, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm10, %ymm13 -; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm8, %ymm15 -; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm12 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm3, %ymm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm0, %xmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] +; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm4, %zmm13 ; AVX512BW-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm9, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm11, %ymm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm10, %ymm13 -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; 
AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm7 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3] +; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm4, %zmm10 +; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7] +; AVX512BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm12 +; AVX512BW-FCP-NEXT: vpermd %ymm16, %ymm5, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm9, %ymm13 -; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm11, %ymm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm14 -; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm15, %ymm6 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm11, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm13, %ymm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm12, %xmm9 +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm4, %zmm14 ; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm9, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm11, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0 -; 
AVX512BW-FCP-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512BW-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %ymm29, (%rdx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rcx) -; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm12, (%r11) -; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r10) -; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm8, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm13, %ymm7 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm12, %xmm9 +; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm15, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm4, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm10, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm8, %ymm8 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm13, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512BW-FCP-NEXT: vmovaps %ymm2, (%rsi) +; AVX512BW-FCP-NEXT: vmovdqa64 %ymm18, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) +; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%r9) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rax) +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -8352,214 +7824,169 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf32: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm26 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm26, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm27 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm27, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm30 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm28 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm28, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm16 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm3, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm5, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm16[0],xmm17[0],xmm16[1],xmm17[1],xmm16[2],xmm17[2],xmm16[3],xmm17[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm12, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm16, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm16 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm4, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm26, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm27, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm3, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm28, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm20[0],xmm8[1],xmm20[1],xmm8[2],xmm20[2],xmm8[3],xmm20[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm7 -; 
AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm26, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm2, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm0, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm27, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm7, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm28, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm24 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm24[0],xmm10[1],xmm24[1],xmm10[2],xmm24[2],xmm10[3],xmm24[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm4, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm26, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm26 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm27, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, 
%ymm28, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm28 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm28[0],xmm10[1],xmm28[1],xmm10[2],xmm28[2],xmm10[3],xmm28[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm8, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm10, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm8, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm3, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm10 +; AVX512DQ-BW-FCP-NEXT: 
vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm0, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm4, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm9, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm11, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm10, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm4, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = 
[1,3,2,3,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm16, %ymm5, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm9, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm11, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm15, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm11, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm13, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm12, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm4, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm9, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm11, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm10, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm1, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm29, (%rdx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm12, (%r11) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r10) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm8, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm13, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm12, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm15, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm4, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm10, %ymm9 +; AVX512DQ-BW-FCP-NEXT: 
vpshufb %ymm29, %ymm8, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm13, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm18, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%r9) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rax) +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <256 x i8>, ptr %in.vec, align 64 @@ -13109,641 +12536,471 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FCP-LABEL: load_i8_stride8_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $904, %rsp # imm = 0x388 -; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa 336(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm14 -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = 
[0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm4 +; AVX2-FCP-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm5 +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm10 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm13 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm13 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm8 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm3 +; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm9 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm9 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm9 -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm11 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm12 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm12 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm1 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm1[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm3 +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm8 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm8 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm8 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm9 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] ; AVX2-FCP-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm1 +; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm1 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm14 +; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14 +; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm8 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm15 +; AVX2-FCP-NEXT: vmovdqa %xmm13, %xmm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm14 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm10 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm11 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm14 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm7 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; 
AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm14 +; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm12 +; AVX2-FCP-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm11 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm7 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm6[0],xmm1[1],xmm6[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte 
Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm3 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm5 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm7 +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm8[0,1,2],xmm7[3] +; AVX2-FCP-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm11 +; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm11 +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm15 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5],ymm15[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm12 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm9 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm9 -; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm6 -; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FCP-NEXT: 
vpshufb %xmm7, %xmm4, %xmm7
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm7
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1],xmm5[2,3]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm7
+; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm7[7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm14
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm2[3]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm0
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm14, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm11
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm12
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm13
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm14
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm15
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm1
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-FCP-NEXT: vmovdqa %xmm2, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm10
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0
+; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm3
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm4
+; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm0
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm1
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm12
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm13
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm15
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm14
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm15
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm9
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm10
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm10
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm10
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm10
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm11
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm12
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm13
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm14
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm15
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm2
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm3
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi)
@@ -13778,7 +13035,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rax)
-; AVX2-FCP-NEXT: addq $904, %rsp # imm = 0x388
+; AVX2-FCP-NEXT: addq $1096, %rsp # imm = 0x448
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
@@ -14534,557 +13791,428 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-FCP-LABEL: load_i8_stride8_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $440, %rsp # imm = 0x1B8
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: subq $232, %rsp
+; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm2
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm3
+; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm12
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm5
-; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm13
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512-FCP-NEXT: vmovdqa 368(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm30
-; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm6
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm25
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512-FCP-NEXT: vmovdqa 336(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
-; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28
-; AVX512-FCP-NEXT: vpmovqb %zmm28, %xmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm19
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpmovqd %ymm2, %xmm4
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm17
+; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpmovqd %ymm3, %xmm12
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm21
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; AVX512-FCP-NEXT: vpmovqb %zmm29, %xmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12
; AVX512-FCP-NEXT: movb $-64, %al
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 {%k1}
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm7
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm20
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm31
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm29
-; AVX512-FCP-NEXT: vpmovqb %zmm29, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm26
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm19
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm21
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm1
; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm13
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm5
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm0
-; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm15
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm28, %zmm3
-; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm8
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm6
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm15
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm10, %xmm16
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm5
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm29, %zmm3
-; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm20
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm17
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX512-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm28, %zmm5
-; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm9
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm25
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm26
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm10, %xmm16
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm7
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm29, %zmm3
-; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1
-; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm12
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm28, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm24
-; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm9
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm21
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm16
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm6
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm29, %zmm3
-; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm28
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31
+; AVX512-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm9
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm9
-; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm20
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm10
-; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm22
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm10
-; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm23
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm24, %zmm10
-; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm25
-; AVX512-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm13
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 {%k1}
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm30
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm24
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm12
-; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm21
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm29, %zmm12
-; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm11
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm16
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm7
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm25
+; AVX512-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm10
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm14
+; AVX512-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm9
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm28
+; AVX512-FCP-NEXT: vpmovqd %ymm27, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm15
+; AVX512-FCP-NEXT: vpmovqd %ymm28, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm7
+; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm18
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm30
+; AVX512-FCP-NEXT: vpmovqb %zmm30, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm7
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
+; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm24
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm22
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm2
; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm19
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm25, %zmm14
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm29, %zmm14
; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm15
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm14
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm14
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm16
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm17
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14
-; AVX512-FCP-NEXT: vmovdqa64 %xmm10, %xmm20
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm11
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14
-; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm22
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm29, %zmm13
-; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm18
-; AVX512-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm29
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm1
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14
+; AVX512-FCP-NEXT: vmovdqa %xmm8, %xmm11
+; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm10
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm30, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
+; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm18
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm22
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm23
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14
-; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm28
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15
-; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm16
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm12
+; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm25, %zmm14
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm29, %zmm14
; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm9
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm15
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm13
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm15
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm11
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm18, %zmm13
-; AVX512-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm21
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm30, %zmm15
+; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm8
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm12
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}}
xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] -; AVX512-FCP-NEXT: vpsrlq $56, %zmm25, %zmm11 -; AVX512-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm8 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] +; AVX512-FCP-NEXT: vpsrlq $24, %zmm29, %zmm8 +; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512-FCP-NEXT: vpsrlq $24, %zmm30, %zmm1 +; AVX512-FCP-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb 
%ymm4, %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7] +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX512-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3 +; AVX512-FCP-NEXT: vpsrlq $32, %zmm29, %zmm3 ; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, (%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, (%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, (%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, (%r8) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm1, (%r9) +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1} +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vpermd %ymm27, %ymm16, %ymm13 +; AVX512-FCP-NEXT: vpshufb 
%xmm15, %xmm13, %xmm0 +; AVX512-FCP-NEXT: vpermd %ymm28, %ymm16, %ymm12 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX512-FCP-NEXT: vpsrlq $32, %zmm30, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] +; AVX512-FCP-NEXT: vpsrlq $40, %zmm29, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512-FCP-NEXT: vpsrlq $40, %zmm30, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vpbroadcastd 
{{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm24 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX512-FCP-NEXT: vpsrlq $48, %zmm29, %zmm14 +; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512-FCP-NEXT: vpsrlq $48, %zmm30, %zmm15 +; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512-FCP-NEXT: vpshufb 
%ymm11, %ymm5, %ymm10 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm9 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] +; AVX512-FCP-NEXT: vpsrlq $56, %zmm29, %zmm8 +; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm7 {%k1} +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] +; AVX512-FCP-NEXT: vpsrlq $56, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r9) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-FCP-NEXT: addq $440, %rsp # imm = 0x1B8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-FCP-NEXT: addq $232, %rsp ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -15840,557 +14968,428 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-FCP-LABEL: load_i8_stride8_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $440, %rsp # imm = 0x1B8 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: subq $232, %rsp +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512DQ-FCP-NEXT: vmovdqa 368(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm30 -; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa 336(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm28, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16 +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm19 +; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpmovqd %ymm2, %xmm4 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpmovqd %ymm3, %xmm12 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512DQ-FCP-NEXT: vpmovqb %zmm29, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: movb $-64, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 {%k1} +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm29 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm29, %xmm3 -; AVX512DQ-FCP-NEXT: 
vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm15 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm15 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm10, %xmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm17 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm26 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm10, %xmm16 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm28, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm24 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm5 -; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm16 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31 +; AVX512DQ-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10 -; AVX512DQ-FCP-NEXT: 
vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm20 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm22 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm23 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm24, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm29, %zmm12 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm25 +; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm14 +; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm28 +; AVX512DQ-FCP-NEXT: vpmovqd %ymm27, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm15 +; AVX512DQ-FCP-NEXT: vpmovqd %ymm28, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm30, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm7 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm24 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm22 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm25, %zmm14 +; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm29, %zmm14 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm16 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm17 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm10, %xmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, 
%xmm13 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm29, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm18 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm29 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, %xmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm30, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm22 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, 
%xmm23 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm28 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm25, %zmm14 +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm29, %zmm14 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm15 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm18, %zmm13 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = 
xmm13[0,1],xmm11[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm21 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm30, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] -; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm25, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm8 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] +; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm29, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm30, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} 
ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm29, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%r8) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%r9) +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1} +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512DQ-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm16, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm0 +; AVX512DQ-FCP-NEXT: vpermd 
%ymm28, %ymm16, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm30, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] +; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm29, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm30, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: 
vinserti64x4 $0, %ymm0, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm24 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm29, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm30, %zmm15 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FCP-NEXT: 
vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm10 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm9 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] +; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm29, %zmm8 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] +; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r9) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-FCP-NEXT: addq $440, %rsp # imm = 0x1B8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FCP-NEXT: addq $232, %rsp ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -17073,429 +16072,357 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i8_stride8_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: subq $328, %rsp # imm = 0x148 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: subq $264, %rsp # imm = 0x108 
+; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30 -; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm1 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm31 -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm31, %ymm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm19 -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm2 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm29 -; AVX512BW-FCP-NEXT: vpermd %ymm29, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm3 +; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm12 +; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 416(%rdi), %ymm22 +; AVX512BW-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm20 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm20, %ymm2 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm27 +; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm3 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa64 368(%rdi), %xmm21 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm21, %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-FCP-NEXT: vmovdqa 336(%rdi), %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm28 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512BW-FCP-NEXT: vpmovqb %zmm18, %xmm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24 +; AVX512BW-FCP-NEXT: vpmovqd 
%ymm24, %xmm18 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512BW-FCP-NEXT: vpmovqd %ymm23, %xmm17 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm17, %xmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512BW-FCP-NEXT: vpmovqb %zmm26, %xmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: movb $-64, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm17 -; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm1 +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm1 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm5 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm27 -; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm16 -; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm16, %ymm8 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 112(%rdi), %xmm26 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm26, %xmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 96(%rdi), %xmm24 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm24, %xmm7 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FCP-NEXT: vmovdqa64 80(%rdi), %xmm22 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm25 -; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm23 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm10[3] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 +; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31 +; AVX512BW-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512BW-FCP-NEXT: vpermd %ymm21, %ymm0, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm0 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 96(%rdi), %ymm28 +; 
AVX512BW-FCP-NEXT: vpmovqd %ymm28, %xmm5 +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512BW-FCP-NEXT: vpmovqd %ymm25, %xmm11 +; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm0 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm31, %ymm13 +; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm12, %ymm13 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm13 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm15 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm13 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm15 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm21, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm23 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm15[0],xmm23[1],xmm15[1],xmm23[2],xmm15[2],xmm23[3],xmm15[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: vmovdqa %xmm12, %xmm7 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm25 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm20 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm20[0],xmm25[0],xmm20[1],xmm25[1],xmm20[2],xmm25[2],xmm20[3],xmm25[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3] -; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm18, %zmm15 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm13 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm26, %zmm15 ; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; 
AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm4, %ymm15 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15 -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm26, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm24, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm13 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm15 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm10, %zmm13 -; AVX512BW-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm10, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm30, %ymm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm31, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm11 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] 
-; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, %xmm1 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm7, %xmm23 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm20 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm20[0],xmm15[0],xmm20[1],xmm15[1],xmm20[2],xmm15[2],xmm20[3],xmm15[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm18, %zmm15 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm13 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm13 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm17, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm26, %zmm15 ; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm17, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm3, %ymm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm15 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15 -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm13 -; 
AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm8, %zmm15 +; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm31, %ymm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm19, %ymm9 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm20, %ymm12 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; 
AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm18, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm17, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm16, %ymm1 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm26, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm24, %xmm2 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm22, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm12 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm17, %xmm6 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] +; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm26, %zmm12 +; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm7, %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm11, %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm10, %zmm2 +; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm8, %zmm2 ; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] -; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpermd (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vpshufb 
%ymm13, %ymm6, %ymm2 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpermd %ymm29, %ymm3, %ymm14 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm19, %xmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm19, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm20, %xmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm20, %xmm17 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm23, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm28, %xmm29 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm29[0],xmm15[0],xmm29[1],xmm15[1],xmm29[2],xmm15[2],xmm29[3],xmm15[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm18, %zmm15 -; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7] +; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm2 +; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm6, %ymm3 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm5 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm3, %ymm11 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7] +; AVX512BW-FCP-NEXT: vpermd %ymm24, %ymm12, %ymm4 +; AVX512BW-FCP-NEXT: vpermd %ymm23, %ymm12, %ymm5 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm4, %xmm13 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm5, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] +; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm26, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm13 +; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload +; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm17, %ymm15 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX512BW-FCP-NEXT: vpermd %ymm31, %ymm6, %ymm16 +; AVX512BW-FCP-NEXT: vpermd %ymm21, %ymm6, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm6 +; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] +; AVX512BW-FCP-NEXT: vpermd %ymm28, %ymm12, %ymm13 +; AVX512BW-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm18 +; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm13, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm18, %xmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] +; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm8, %zmm12 +; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm12 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm3, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm4, %xmm12 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm26, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm19 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm19, %ymm9 -; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm18 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm15 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7] -; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm3, %ymm21 -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm3 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm21, %ymm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm22, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm8, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm29 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm9 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm11 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm16, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm17, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm23, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm27 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm20[0],xmm27[1],xmm20[1],xmm27[2],xmm20[2],xmm27[3],xmm20[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm12[3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm25, %zmm12 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm17, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7] 
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm16, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm15, %ymm9 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm13, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3] +; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm8, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm1, %ymm10 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm4, %xmm10 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm5, %xmm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] +; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm26, %zmm12 ; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm19, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm18, %ymm12 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm21, %ymm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm12 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] 
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm20 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm4 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm14, %ymm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm16, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm17, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm23, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm28, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3] -; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm27, %zmm12 -; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm19, %ymm3 -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm12 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm21, %ymm13 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm26, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm24, %xmm4 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm22, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm10, %zmm11 -; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 
$0, %ymm3, %zmm9, %zmm3 +; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm12 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm13, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm18, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm8, %zmm14 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0 -; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm5, %ymm1 -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm2 +; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm16, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm4 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm23, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm28, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] -; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm27, %zmm9 -; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm19, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm18, %ymm9 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm15, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm21, %ymm11 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm26, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm24, %xmm1 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm22, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm10, %zmm4 -; AVX512BW-FCP-NEXT: vpmovqb %zmm4, %xmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm4, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm5, %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm26, %zmm2 +; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm17, %ymm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm15, %ymm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm13, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm18, %xmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rsi) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rdx) ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rcx) -; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-FCP-NEXT: addq $328, %rsp # imm = 0x148 +; AVX512BW-FCP-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -18178,429 +17105,357 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: subq $328, %rsp # imm = 0x148 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm31 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm31, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm29 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm29, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 416(%rdi), %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm20, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm27 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 368(%rdi), %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm21, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 336(%rdi), %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %xmm28 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm5[0,1,2],xmm2[3] -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm18, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm24, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm23, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm17, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm26, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm27 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm16, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 112(%rdi), %xmm26 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm26, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 96(%rdi), %xmm24 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm24, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 80(%rdi), %xmm22 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1,2],xmm10[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31 +; AVX512DQ-BW-FCP-NEXT: 
vpermd %ymm31, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm21, %ymm0, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 96(%rdi), %ymm28 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm28, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm25, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm30, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm31, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm12, %ymm13 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm21, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm23[0],xmm15[0],xmm23[1],xmm15[1],xmm23[2],xmm15[2],xmm23[3],xmm15[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm23 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm12, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm12, %xmm25 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm28, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm20[0],xmm25[0],xmm20[1],xmm25[1],xmm20[2],xmm25[2],xmm20[3],xmm25[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm18, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm26, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm17, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm4, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm26, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm24, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm22, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm8, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm10, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm10, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm8, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm30, %ymm0 -; 
AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm31, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm19, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm7, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm20[0],xmm15[0],xmm20[1],xmm15[1],xmm20[2],xmm15[2],xmm20[3],xmm15[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm18, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm17, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm26, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm17, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm3, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 
= xmm15[0,1],xmm13[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm8, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm31, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm19, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = 
[3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm20, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm18, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm17, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm16, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm26, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm24, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm22, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm17, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm26, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm7, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm11, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm10, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm8, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpermd (%rsp), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm29, %ymm3, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm19, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm19, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm20, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm20, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm20 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm23, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm28, %xmm29 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm29[0],xmm15[0],xmm29[1],xmm15[1],xmm29[2],xmm15[2],xmm29[3],xmm15[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm18, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm6, %ymm0 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm6, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm3, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm24, %ymm12, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm23, %ymm12, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm4, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm5, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm26, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm17, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm31, %ymm6, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm21, %ymm6, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm28, %ymm12, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm13, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm18, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm8, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = 
[1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm3, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm4, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm26, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm19 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm19, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm18 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm3, %ymm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm21, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm22, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm8, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = 
[1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm16, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm17, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm23, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm28, %xmm27 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm20[0],xmm27[1],xmm20[1],xmm27[2],xmm20[2],xmm27[3],xmm20[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm25, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm17, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm16, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm15, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm13, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm8, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm1, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7] +; 
AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm4, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm5, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm26, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm19, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm18, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm21, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm24, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm22, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm14, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm16, %xmm9 -; AVX512DQ-BW-FCP-NEXT: 
vpshufb %xmm4, %xmm17, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm23, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm28, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm27, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm19, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm18, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm21, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm26, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm24, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm22, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm10, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm13, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm18, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm8, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb 
%ymm12, %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm5, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm14, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm16, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm23, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm28, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm27, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm19, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm18, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm15, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm21, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm26, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm24, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm22, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm10, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm4, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm5, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm26, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm17, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm15, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm13, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm18, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-BW-FCP-NEXT: addq $328, %rsp # imm = 0x148 +; AVX512DQ-BW-FCP-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <512 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 941b18db0931a..f7a44fea5b02b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -1185,451 +1185,429 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-LABEL: store_i8_stride5_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[6],zero,zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9,25],zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28] -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero -; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,ymm9[9],zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero,zero,zero -; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[19,27],zero,zero,zero,ymm6[20,28],zero,zero,zero,ymm6[21,29],zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] -; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22] -; AVX2-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero +; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = 
zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero +; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero +; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22] +; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX2-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX2-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-NEXT: vmovdqa %ymm7, 32(%r9) +; AVX2-NEXT: vmovdqa %ymm3, 32(%r9) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i8_stride5_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[6],zero,zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9,25],zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero -; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpshufb 
{{.*#+}} ymm8 = zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,ymm9[9],zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[19,27],zero,zero,zero,ymm6[20,28],zero,zero,zero,ymm6[21,29],zero,zero,zero -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22] -; AVX2-FP-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero -; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] -; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FP-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6],zero,zero,zero,zero,ymm2[7],zero,zero,zero,zero,ymm2[8],zero,zero,zero,zero,ymm2[9,25],zero,zero,zero,zero,ymm2[26],zero,zero,zero,zero,ymm2[27],zero,zero,zero,zero,ymm2[28] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero +; AVX2-FP-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX2-FP-NEXT: vpshufb 
{{.*#+}} ymm4 = zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9],zero,zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28],zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero +; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[5,13],zero,zero,zero,xmm5[6,14],zero,zero,zero,xmm5[7,15],zero,zero,zero +; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,0] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3,19],zero,zero,zero,ymm1[28,20],zero,zero,zero,ymm1[29,21],zero,zero,zero,ymm1[30,22] +; AVX2-FP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FP-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX2-FP-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX2-FP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-FP-NEXT: vmovdqa %ymm7, 32(%r9) +; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%r9) ; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i8_stride5_vf16: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,ymm7[1,9],zero,zero,zero,ymm7[2,10],zero,zero,zero,ymm7[19,27],zero,zero,zero,ymm7[20,28],zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm8[0,8],zero,zero,zero,ymm8[1,9],zero,zero,zero,ymm8[2,10],zero,zero,zero,ymm8[3,19],zero,zero,zero,ymm8[28,20],zero,zero,zero,ymm8[29,21],zero,zero,zero,ymm8[30,22] -; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,0,1,1] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm5[3,7],zero,zero,zero,ymm5[8,12],zero,zero,zero,ymm5[9,13],zero,zero,zero,ymm5[18,22],zero,zero,zero,ymm5[19,23],zero,zero,zero,ymm5[24,28],zero,zero -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6],zero,zero,zero,ymm6[3,7],zero,zero,zero,ymm6[8,12],zero,zero,zero,ymm6[9,17],zero,zero,zero,ymm6[22,18],zero,zero,zero,ymm6[23,19],zero,zero,zero,ymm6[24,28] -; AVX2-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,1,2,2,2,2,2,2] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero -; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX2-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] -; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,13],zero,zero,zero,xmm3[6,14],zero,zero,zero,xmm3[7,15],zero +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm4[5,13],zero,zero,zero,xmm4[6,14],zero,zero,zero,xmm4[7,15],zero,zero,zero +; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,2,0,2] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[19,27],zero,zero,zero,ymm4[20,28],zero,zero,zero,ymm4[21,29],zero,zero,zero +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,2,0] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22] +; AVX2-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = 
[0,0,0,0,0,0,1,1] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,2,6,3,7] +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[3,7],zero,zero,zero,ymm1[8,12],zero,zero,zero,ymm1[9,13],zero,zero,zero,ymm1[18,22],zero,zero,zero,ymm1[19,23],zero,zero,zero,ymm1[24,28],zero,zero +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] +; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,6],zero,zero,zero,ymm2[3,7],zero,zero,zero,ymm2[8,12],zero,zero,zero,ymm2[9,17],zero,zero,zero,ymm2[22,18],zero,zero,zero,ymm2[23,19],zero,zero,zero,ymm2[24,28] +; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,1,2,2,2,2,2,2] +; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX2-FCP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9) -; AVX2-FCP-NEXT: vmovdqa %ymm7, (%r9) ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: store_i8_stride5_vf16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512-NEXT: vmovdqa (%r8), %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6 -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,7],zero,ymm6[u,u,u,8],zero,ymm6[u,u,u,9],zero,ymm6[u,u,u],zero,ymm6[26,u,u,u],zero,ymm6[27,u,u,u],zero,ymm6[28,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[7,u,u,u],zero,ymm8[8,u,u,u],zero,ymm8[9,u,u,u,26],zero,ymm8[u,u,u,27],zero,ymm8[u,u,u,28],zero,ymm8[u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 | ymm7) -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,ymm5[u,u,u,7],zero,ymm5[u,u,u,8],zero,ymm5[u,u,u,9,25,u,u,u],zero,ymm5[26,u,u,u],zero,ymm5[27,u,u,u],zero,ymm5[28] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6,u,u,u],zero,ymm5[7,u,u,u],zero,ymm5[8,u,u,u],zero,zero,ymm5[u,u,u,26],zero,ymm5[u,u,u,27],zero,ymm5[u,u,u,28],zero -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 & (ymm5 | ymm8) -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[0,8],zero,zero,ymm6[u,1,9],zero,zero,ymm6[u,2,10],zero,zero,ymm6[u,3,19],zero,zero,ymm6[u,28,20],zero,zero,ymm6[u,29,21],zero,zero,ymm6[u,30,22] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512-NEXT: vpermd %zmm1, %zmm6, %zmm6 -; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u] -; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12],zero,zero,zero,zero,xmm1[13],zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,xmm1[15] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-NEXT: vmovdqa (%r8), %xmm0 +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4) +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] +; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride5_vf16: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[u,3,7],zero,zero,ymm8[u,8,12],zero,zero,ymm8[u,9,13],zero,zero,ymm8[u,18,22],zero,zero,ymm8[u,19,23],zero,zero,ymm8[u,24,28],zero,zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,1,9],zero,zero,ymm5[u,2,10],zero,zero,ymm5[u,3,19],zero,zero,ymm5[u,28,20],zero,zero,ymm5[u,29,21],zero,zero,ymm5[u,30,22] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u] -; AVX512-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] +; AVX512-FCP-NEXT: 
vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] ; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride5_vf16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,7],zero,ymm6[u,u,u,8],zero,ymm6[u,u,u,9],zero,ymm6[u,u,u],zero,ymm6[26,u,u,u],zero,ymm6[27,u,u,u],zero,ymm6[28,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[7,u,u,u],zero,ymm8[8,u,u,u],zero,ymm8[9,u,u,u,26],zero,ymm8[u,u,u,27],zero,ymm8[u,u,u,28],zero,ymm8[u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 | ymm7) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,ymm5[u,u,u,7],zero,ymm5[u,u,u,8],zero,ymm5[u,u,u,9,25,u,u,u],zero,ymm5[26,u,u,u],zero,ymm5[27,u,u,u],zero,ymm5[28] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6,u,u,u],zero,ymm5[7,u,u,u],zero,ymm5[8,u,u,u],zero,zero,ymm5[u,u,u,26],zero,ymm5[u,u,u,27],zero,ymm5[u,u,u,28],zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm9 & (ymm5 | ymm8) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,1,9],zero,zero,ymm6[u,2,10],zero,zero,ymm6[u,3,19],zero,zero,ymm6[u,28,20],zero,zero,ymm6[u,29,21],zero,zero,ymm6[u,30,22] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u] -; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12],zero,zero,zero,zero,xmm1[13],zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,xmm1[15] -; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[6],zero,ymm2[u,u,u,7],zero,ymm2[u,u,u,8],zero,ymm2[u,u,u,9,25,u,u,u],zero,ymm2[26,u,u,u],zero,ymm2[27,u,u,u],zero,ymm2[28] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} 
ymm6 = zero,ymm6[6,u,u,u],zero,ymm6[7,u,u,u],zero,ymm6[8,u,u,u],zero,zero,ymm6[u,u,u,26],zero,ymm6[u,u,u,27],zero,ymm6[u,u,u,28],zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 & (ymm6 | ymm4) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] +; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride5_vf16: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[u,3,7],zero,zero,ymm8[u,8,12],zero,zero,ymm8[u,9,13],zero,zero,ymm8[u,18,22],zero,zero,ymm8[u,19,23],zero,zero,ymm8[u,24,28],zero,zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,1,9],zero,zero,ymm5[u,2,10],zero,zero,ymm5[u,3,19],zero,zero,ymm5[u,28,20],zero,zero,ymm5[u,29,21],zero,zero,ymm5[u,30,22] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6 -; 
AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u] -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,5,2,6,6,2,3,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,5,13],zero,zero,xmm1[u,6,14],zero,zero,xmm1[u,7,15],zero,zero,xmm1[u] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i8_stride5_vf16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-NEXT: vmovdqa 
(%rcx), %xmm3 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9],zero,zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28],zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,ymm8[9],zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9,25],zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[6],zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero -; AVX512BW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero +; AVX512BW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero +; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm8 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[3,19],zero,zero,zero,ymm6[28,20],zero,zero,zero,ymm6[29,21],zero,zero,zero,ymm6[30,22] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] +; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = 
zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] -; AVX512BW-NEXT: vpermd %zmm4, %zmm6, %zmm6 +; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512BW-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] -; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2 -; AVX512BW-NEXT: vmovdqa %xmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] +; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 +; AVX512BW-NEXT: vmovdqa %xmm2, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i8_stride5_vf16: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7] -; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm7 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[0,8],zero,zero,zero,zmm7[1,9],zero,zero,zero,zmm7[2,10],zero,zero,zero,zmm7[3,19],zero,zero,zero,zmm7[28,20],zero,zero,zero,zmm7[29,21],zero,zero,zero,zmm7[30,22,34,38],zero,zero,zero,zmm7[35,39],zero,zero,zero,zmm7[40,44],zero,zero,zero,zmm7[41,49],zero,zero,zero,zmm7[54,50],zero,zero,zero,zmm7[55,51],zero,zero,zero,zmm7[56,60] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] -; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 -; 
AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zmm5[1,9],zero,zero,zero,zmm5[2,10],zero,zero,zero,zmm5[19,27],zero,zero,zero,zmm5[20,28],zero,zero,zero,zmm5[21,29],zero,zero,zero,zero,zero,zero,zmm5[35,39],zero,zero,zero,zmm5[40,44],zero,zero,zero,zmm5[41,45],zero,zero,zero,zmm5[50,54],zero,zero,zero,zmm5[51,55],zero,zero,zero,zmm5[56,60],zero,zero
-; AVX512BW-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm6
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
-; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,2,6,6,2,3,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero
+; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
+; AVX512BW-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4
 ; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
-; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2
-; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 | xmm2 | xmm1
+; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 64(%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
 ; AVX512BW-FCP-NEXT: vzeroupper
 ; AVX512BW-FCP-NEXT: retq
 ;
 ; AVX512DQ-BW-LABEL: store_i8_stride5_vf16:
 ; AVX512DQ-BW: # %bb.0:
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm4
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9],zero,zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28],zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,ymm8[9],zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9,25],zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28]
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[6],zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero
-; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm3, %ymm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28]
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero
+; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4
 ; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm8 {%k1}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[3,19],zero,zero,zero,ymm6[28,20],zero,zero,zero,ymm6[29,21],zero,zero,zero,ymm6[30,22]
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22]
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
-; AVX512DQ-BW-NEXT: vpermd %zmm4, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
+; AVX512DQ-BW-NEXT: vpermd %zmm2, %zmm4, %zmm4
 ; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
-; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2
-; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 64(%r9)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9)
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
+; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1
+; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 64(%r9)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r9)
 ; AVX512DQ-BW-NEXT: vzeroupper
 ; AVX512DQ-BW-NEXT: retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf16:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[0,8],zero,zero,zero,zmm7[1,9],zero,zero,zero,zmm7[2,10],zero,zero,zero,zmm7[3,19],zero,zero,zero,zmm7[28,20],zero,zero,zero,zmm7[29,21],zero,zero,zero,zmm7[30,22,34,38],zero,zero,zero,zmm7[35,39],zero,zero,zero,zmm7[40,44],zero,zero,zero,zmm7[41,49],zero,zero,zero,zmm7[54,50],zero,zero,zero,zmm7[55,51],zero,zero,zero,zmm7[56,60]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zmm5[1,9],zero,zero,zero,zmm5[2,10],zero,zero,zero,zmm5[19,27],zero,zero,zero,zmm5[20,28],zero,zero,zero,zmm5[21,29],zero,zero,zero,zero,zero,zero,zmm5[35,39],zero,zero,zero,zmm5[40,44],zero,zero,zero,zmm5[41,45],zero,zero,zero,zmm5[50,54],zero,zero,zero,zmm5[51,55],zero,zero,zero,zmm5[56,60],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,5,2,6,6,2,3,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zmm4[1,9],zero,zero,zero,zmm4[2,10],zero,zero,zero,zmm4[19,27],zero,zero,zero,zmm4[20,28],zero,zero,zero,zmm4[21,29],zero,zero,zero,zero,zero,zero,zmm4[35,39],zero,zero,zero,zmm4[40,44],zero,zero,zero,zmm4[41,45],zero,zero,zero,zmm4[50,54],zero,zero,zero,zmm4[51,55],zero,zero,zero,zmm4[56,60],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm4
 ; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
-; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,13],zero,zero,zero,xmm1[6,14],zero,zero,zero,xmm1[7,15],zero
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5,13],zero,zero,zero,xmm0[6,14],zero,zero,zero,xmm0[7,15],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 | xmm2 | xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 64(%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
 ; AVX512DQ-BW-FCP-NEXT: retq
 %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 98a64ee987f7b..ab968b91153a9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -1813,81 +1813,79 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-NEXT: vmovdqa (%rdx), %xmm5
-; AVX2-NEXT: vmovdqa (%rcx), %xmm6
-; AVX2-NEXT: vmovdqa (%r8), %xmm3
-; AVX2-NEXT: vmovdqa (%r9), %xmm4
-; AVX2-NEXT: vmovdqa (%r10), %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10
-; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero,zero,ymm11[25]
-; AVX2-NEXT: vpor %ymm9, %ymm11, %ymm9
-; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero,zero,zero
-; AVX2-NEXT: vpor %ymm12, %ymm11, %ymm11
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
-; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9
-; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4],zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero
-; AVX2-NEXT: vpor %ymm12, %ymm11, %ymm11
-; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
-; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
-; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9
-; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm10[0,2,0,2]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0]
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
-; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,2,0,2]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28]
-; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,2,0,2]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,8],zero,zero,zero,zero,zero,ymm13[1,9],zero,zero,zero,zero,zero,ymm13[2,10],zero,zero,zero,zero,zero,ymm13[19,27],zero,zero,zero,zero,zero,ymm13[20,28],zero,zero
-; AVX2-NEXT: vpor %ymm12, %ymm13, %ymm12
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11
-; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,5,5,6]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3]
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
-; AVX2-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10
-; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[3,19],zero,zero,zero,zero,zero,ymm8[28,20],zero,zero,zero,zero,zero,ymm8[29,21],zero
+; AVX2-NEXT: vmovdqa (%rdi), %xmm3
+; AVX2-NEXT: vmovdqa (%rdx), %xmm4
+; AVX2-NEXT: vmovdqa (%r8), %xmm1
+; AVX2-NEXT: vmovdqa (%r9), %xmm2
+; AVX2-NEXT: vmovdqa (%r10), %xmm0
+; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX2-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6
+; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25]
+; AVX2-NEXT: vpor %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
+; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero
+; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
+; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,2,0,2]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[18,26],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28]
+; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,0,2]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,8],zero,zero,zero,zero,zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[19,27],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero
+; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
+; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero
+; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero
+; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,2]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0]
-; AVX2-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,2]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0]
+; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %ymm6, 64(%rax)
+; AVX2-NEXT: vmovdqa %ymm7, (%rax)
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
+; AVX2-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX2-NEXT: vmovdqa %ymm7, 64(%rax)
-; AVX2-NEXT: vmovdqa %ymm11, (%rax)
-; AVX2-NEXT: vmovdqa %ymm9, 32(%rax)
+; AVX2-NEXT: vmovdqa %ymm5, 32(%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -1895,77 +1893,75 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP: # %bb.0:
 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm4
-; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm5
-; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
-; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3
+; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3
+; AVX2-FP-NEXT: vmovdqa (%r8), %xmm0
+; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1
+; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm2, %ymm2
+; AVX2-FP-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3
 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
-; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm10
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[5],zero,zero,zero,zero,zero,zero,ymm9[6],zero,zero,zero,zero,zero,ymm9[23],zero,zero,zero,zero,zero,zero,ymm9[24],zero,zero,zero,zero,zero,zero,ymm9[25]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25]
+; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[5],zero,zero,zero,zero,zero,zero,ymm3[6],zero,zero,zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,ymm3[24],zero,zero,zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
+; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero
+; AVX2-FP-NEXT: vpor %ymm7, %ymm4, %ymm7
+; AVX2-FP-NEXT: vmovdqa (%r10), %xmm4
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
+; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,2,0,2]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[18,26],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,8],zero,zero,zero,zero,zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[19,27],zero,zero,zero,zero,zero,ymm9[20,28],zero,zero
 ; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpor %ymm11, %ymm9, %ymm9
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm9
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = zero,ymm11[4],zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero
-; AVX2-FP-NEXT: vpor %ymm11, %ymm8, %ymm11
-; AVX2-FP-NEXT: vmovdqa (%r10), %xmm8
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
-; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
-; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,2,0,2]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
-; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,2,0,2]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,8],zero,zero,zero,zero,zero,ymm13[1,9],zero,zero,zero,zero,zero,ymm13[2,10],zero,zero,zero,zero,zero,ymm13[19,27],zero,zero,zero,zero,zero,ymm13[20,28],zero,zero
-; AVX2-FP-NEXT: vpor %ymm12, %ymm13, %ymm12
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
-; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,1,1,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,zero,zero,zero,ymm6[10,2],zero,zero,zero,zero,zero,ymm6[11,3],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero,zero,zero,zero,ymm6[21,29],zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,3,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[2,10],zero,zero,zero,zero,zero,ymm7[3,19],zero,zero,zero,zero,zero,ymm7[28,20],zero,zero,zero,zero,zero,ymm7[29,21],zero
-; AVX2-FP-NEXT: vpor %ymm6, %ymm7, %ymm6
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
-; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm6
-; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[12,13],zero,zero,zero,zero,zero,xmm4[14,15],zero,zero,zero
-; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpor %xmm4, %xmm0, %xmm0
-; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm8[13,14,15,4,5],zero,zero,xmm8[14,15,14,15,12],zero,zero,xmm8[15]
-; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
-; AVX2-FP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-FP-NEXT: vmovdqa %xmm0, 96(%rax)
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm2[3,1,1,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm3[1,3,3,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero
+; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[6,14],zero,zero,zero,zero,zero,xmm2[7,15],zero,zero,zero,zero,zero
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,zero,zero,zero,zero,xmm0[13,12],zero,zero,zero,zero,zero,xmm0[15,14],zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm4[13,14,15,4,5],zero,zero,xmm4[14,15,14,15,12],zero,zero,xmm4[15]
+; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX2-FP-NEXT: vmovdqa %ymm6, 64(%rax)
-; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax)
-; AVX2-FP-NEXT: vmovdqa %ymm9, 32(%rax)
+; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax)
+; AVX2-FP-NEXT: vmovdqa %ymm9, 32(%rax)
+; AVX2-FP-NEXT: vmovdqa %ymm7, (%rax)
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
+; AVX2-FP-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
+; AVX2-FP-NEXT: vmovdqa %xmm0, 96(%rax)
+; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rax)
 ; AVX2-FP-NEXT: vzeroupper
 ; AVX2-FP-NEXT: retq
 ;
@@ -1973,75 +1969,73 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP: # %bb.0:
 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm5
-; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm6
-; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm3
-; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm4
-; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm2
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[1,1,0,0,4,5,6,7]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm7
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[0,2,0,2]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm11, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,2,0,2]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,ymm11[0,8],zero,zero,zero,zero,zero,ymm11[1,9],zero,zero,zero,zero,zero,ymm11[18,26],zero,zero,zero,zero,zero,ymm11[19,27],zero,zero,zero,zero,zero,ymm11[20,28]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,2,0,2]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[2,10],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28],zero,zero
-; AVX2-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm11, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0]
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,5,2,6,1,5,2,6]
-; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm13
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
-; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vpermd %ymm9, %ymm12, %ymm13
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,ymm13[1,5],zero,zero,zero,zero,zero,ymm13[2,6],zero,zero,zero,zero,zero,ymm13[19,23],zero,zero,zero,zero,zero,ymm13[24,28],zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm12
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,ymm12[1,5],zero,zero,zero,zero,zero,ymm12[2,6],zero,zero,zero,zero,zero,ymm12[19,23],zero,zero,zero,zero,zero,ymm12[24,28],zero,zero,zero,zero,zero,ymm12[25]
-; AVX2-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,5,5,6]
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,2,3,3,2,2,3,3]
-; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3]
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm3
+; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm4
+; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1
+; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm2
+; AVX2-FCP-NEXT: vmovdqa (%r10), %xmm0
+; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX2-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,2,0,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero
+; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,5,2,6,1,5,2,6]
+; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm9
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
+; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm9[1,5],zero,zero,zero,zero,zero,ymm9[2,6],zero,zero,zero,zero,zero,ymm9[19,23],zero,zero,zero,zero,zero,ymm9[24,28],zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm8[1,5],zero,zero,zero,zero,zero,ymm8[2,6],zero,zero,zero,zero,zero,ymm8[19,23],zero,zero,zero,zero,zero,ymm8[24,28],zero,zero,zero,zero,zero,ymm8[25]
+; AVX2-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
+; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[3,1,1,3]
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm4[1,3,3,1]
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9],zero,zero,zero,zero,zero,ymm9[2,10],zero,zero,zero,zero,zero,ymm9[3,19],zero,zero,zero,zero,zero,ymm9[28,20],zero,zero,zero,zero,zero,ymm9[29,21],zero
 ; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
-; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm10, %ymm8
-; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[12,13],zero,zero,zero,zero,zero,xmm5[14,15],zero,zero,zero
-; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm6, %ymm6
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero
+; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,14,15,4,5],zero,zero,xmm2[14,15,14,15,12],zero,zero,xmm2[15]
-; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
-; AVX2-FCP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,14,15,4,5],zero,zero,xmm2[14,15,14,15,12],zero,zero,xmm2[15]
+; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
+; AVX2-FCP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,14,15,4,5],zero,zero,xmm0[14,15,14,15,12],zero,zero,xmm0[15]
+; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-FCP-NEXT: vmovdqa %ymm6, 64(%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm7, 32(%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rax)
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
+; AVX2-FCP-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
 ; AVX2-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm8, 64(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm11, 32(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rax)
 ; AVX2-FCP-NEXT: vzeroupper
 ; AVX2-FCP-NEXT: retq
 ;
@@ -2049,76 +2043,74 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512-NEXT: vmovdqa (%rdx), %xmm5
-; AVX512-NEXT: vmovdqa (%rcx), %xmm6
-; AVX512-NEXT: vmovdqa (%r8), %xmm3
-; AVX512-NEXT: vmovdqa (%r9), %xmm4
-; AVX512-NEXT: vmovdqa (%r10), %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7
-; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ~ymm12 & (ymm11 | ymm10)
-; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28]
-; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
-; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero
-; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm12 & (ymm13 | ymm11)
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero
-; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11
-; AVX512-NEXT: vporq %zmm10, %zmm11, %zmm10
-; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 & (ymm12 | ymm11)
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u]
-; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX512-NEXT: vpandn %ymm12, %ymm13, %ymm12
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0]
-; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
-; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512-NEXT: vporq %zmm12, %zmm11, %zmm11
-; AVX512-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10))
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u]
-; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm9 & ~mem)
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm8))
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u]
-; AVX512-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX512-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512-NEXT: vmovdqa (%r8), %xmm1
+; AVX512-NEXT: vmovdqa (%r9), %xmm2
+; AVX512-NEXT: vmovdqa (%r10), %xmm0
+; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6)
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7)
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero
+; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512-NEXT: vporq %zmm6, %zmm7, %zmm6
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm9 & (ymm8 | ymm7)
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u]
+; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0]
+; AVX512-NEXT: vpandn %ymm8, %ymm9, %ymm8
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0]
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
+; AVX512-NEXT: vporq %zmm8, %zmm7, %zmm7
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6))
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
+; AVX512-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem)
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
+; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15]
-; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
-; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0
-; AVX512-NEXT: vmovdqa %xmm1, 96(%rax)
-; AVX512-NEXT: vmovdqa %ymm0, 64(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15]
+; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
+; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0
+; AVX512-NEXT: vmovdqa %xmm1, 96(%rax)
+; AVX512-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
+; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1
+; AVX512-NEXT: vmovdqa %xmm0, 96(%rax)
+; AVX512-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm7, (%rax)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -2126,69 +2118,67 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP: # %bb.0:
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6
-; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3
-; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm4
-; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6]
-; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
-; AVX512-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u]
-; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm12 & mem)
-; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1
+; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2
+; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm0
+; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6]
+; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512-FCP-NEXT: vporq %zmm6, %zmm8, %zmm6
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,0,1,0,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[0,2,0,2]
+; AVX512-FCP-NEXT: 
vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm8 & mem) +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] -; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm8 & ~mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7)) -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] +; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm1, 96(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -2196,76 +2186,74 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ~ymm12 & (ymm11 | ymm10) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm12 & (ymm13 | ymm11) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 -; AVX512DQ-NEXT: vporq %zmm10, %zmm11, %zmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm13 & (ymm12 | ymm11) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX512DQ-NEXT: vpandn %ymm12, %ymm13, %ymm12 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512DQ-NEXT: vporq %zmm12, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u] -; AVX512DQ-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm9 & ~mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm8)) -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; 
AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512DQ-NEXT: vporq %zmm6, %zmm7, %zmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm9 & (ymm8 | ymm7) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,0] +; AVX512DQ-NEXT: vpandn %ymm8, %ymm9, %ymm8 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512DQ-NEXT: vporq %zmm8, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] +; AVX512DQ-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] +; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] -; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0 -; AVX512DQ-NEXT: vmovdqa %xmm1, 96(%rax) -; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2273,69 +2261,67 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, 
%ymm12, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm12 & mem) -; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6] +; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512DQ-FCP-NEXT: vporq %zmm6, %zmm8, %zmm6 +; 
AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,0,1,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm8 & mem) +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] -; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm8 & ~mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem) +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 96(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -2343,82 +2329,80 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1 -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm6, %ymm6 -; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm6, %zmm6 -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512BW-NEXT: vextracti64x4 $1, %zmm6, %ymm8 -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,7,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,3,2] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512BW-NEXT: vpshufhw {{.*#+}} 
xmm5 = xmm4[0,1,2,3,6,7,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] ; AVX512BW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm7 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[12,13],zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[12,13],zero,zero,zero,zero,zero,xmm3[14,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm2 {%k1} +; AVX512BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1} ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %ymm8, %ymm3, %ymm3 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,1,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-NEXT: vpermw %ymm4, %ymm3, %ymm3 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,3,1] +; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,3,3,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[3,1,1,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,zero,zero,zero,ymm5[10,2],zero,zero,zero,zero,zero,ymm5[11,3],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero,zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero +; AVX512BW-NEXT: vpor %ymm3, %ymm7, %ymm3 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4 -; 
AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512BW-NEXT: vpor %ymm4, %ymm7, %ymm4 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] +; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] ; AVX512BW-NEXT: vpor %ymm7, %ymm6, %ymm6 ; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 
= ymm0[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28] -; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28] +; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa %xmm2, 96(%rax) +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm5, 96(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512BW-NEXT: vzeroupper @@ -2428,72 +2412,70 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm4, %ymm4 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15] -; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), 
%xmm0 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,zero,zero,zero,zero,xmm4[13,12],zero,zero,zero,zero,zero,xmm4[15,14],zero +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[13],zero,zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,zero,xmm5[15] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm4[1,3,1,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermw %ymm5, %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm6[1,3,3,1] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[3,1,1,3] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,3,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[3,19],zero,zero,zero,zero,zero,ymm4[28,20],zero,zero,zero,zero,zero,ymm4[29,21],zero +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 ; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; 
AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,5,2,6,1,5,2,6] -; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm7 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm4 {%k1} +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm5, %zmm6 +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] +; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm7 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] ; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm3 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] -; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,2,0,2] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zmm2[33,37],zero,zero,zero,zero,zero,zmm2[34,38],zero,zero,zero,zero,zero,zmm2[51,55],zero,zero,zero,zero,zero,zmm2[56,60],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm6 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero,zero,zmm1[57] +; AVX512BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, 
%zmm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zmm0[20,28],zero,zero,zero,zero,zero,zmm0[33,37],zero,zero,zero,zero,zero,zmm0[34,38],zero,zero,zero,zero,zero,zmm0[51,55],zero,zero,zero,zero,zero,zmm0[56,60],zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa %xmm3, 96(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -2501,82 +2483,80 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm6, %ymm8 -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,7,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,3,2] +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,7,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] ; AVX512DQ-BW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm9, %xmm7 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[12,13],zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[12,13],zero,zero,zero,zero,zero,xmm3[14,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 ; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm7, %xmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1} ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermw %ymm8, %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,1,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512DQ-BW-NEXT: vpermw %ymm4, %ymm3, %ymm3 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512DQ-BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,3,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,3,3,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[3,1,1,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,zero,zero,zero,ymm5[10,2],zero,zero,zero,zero,zero,ymm5[11,3],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero,zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm7, %ymm3 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} +; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = 
[1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm6 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm7, %ymm4 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] ; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm6, %ymm6 ; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb 
{{.*#+}} ymm1 = zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[18,26],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28] -; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[19,27],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28] +; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, 96(%rax) +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa %xmm5, 96(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper @@ -2586,72 +2566,70 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm4, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; 
AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,zero,zero,zero,zero,xmm4[13,12],zero,zero,zero,zero,zero,xmm4[15,14],zero +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[13],zero,zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,zero,xmm5[15] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm4[1,3,1,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm5, %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,1,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512DQ-BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm6[1,3,3,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[3,1,1,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,3,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[3,19],zero,zero,zero,zero,zero,ymm4[28,20],zero,zero,zero,zero,zero,ymm4[29,21],zero +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm1[3,1,1,3] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, 
%zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,5,2,6,1,5,2,6] -; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm5, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] +; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] ; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,2,0,2] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zmm2[33,37],zero,zero,zero,zero,zero,zmm2[34,38],zero,zero,zero,zero,zero,zmm2[51,55],zero,zero,zero,zero,zero,zmm2[56,60],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero,zero,zmm1[57] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 
= ymm0[0,2,0,2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zmm0[20,28],zero,zero,zero,zero,zero,zmm0[33,37],zero,zero,zero,zero,zero,zmm0[34,38],zero,zero,zero,zero,zero,zmm0[51,55],zero,zero,zero,zero,zero,zmm0[56,60],zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, 96(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index 9e82c84fe5520..ec54b75513582 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -946,10 +946,8 @@ define <2 x i64> @PR116815(<4 x i64> %v0, <4 x i64> %v1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpslld $16, %ymm1, %ymm1 ; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,2,6,10,14,u,u,u,u,u,u,u,u,16,20,24,28,18,22,26,30,u,u,u,u,u,u,u,u] ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vmovq {{.*#+}} xmm2 = [0,4,8,12,2,6,10,14,0,0,0,0,0,0,0,0] -; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 3d49edbb7bd8d..3e76bffb77a66 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -483,50 +483,42 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; ; AVX2-LABEL: interleaved_load_vf16_i8_stride4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-NEXT: vpshufb %ymm1, %ymm0, 
%ymm1 +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,0,0,4] +; AVX2-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0 -; AVX2-NEXT: vpxor %xmm0, %xmm4, %xmm0 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: interleaved_load_vf16_i8_stride4: @@ -646,76 +638,66 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; ; AVX2-LABEL: interleaved_load_vf32_i8_stride4: ; 
AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm9 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] -; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-NEXT: vpermd %ymm7, %ymm6, %ymm7 -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm7 +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX2-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; 
AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX2-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm8 +; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm11 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpxor %ymm0, %ymm7, %ymm0 +; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX2-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpcmpeqb %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; 
AVX2-NEXT: retq ; diff --git a/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll b/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll index 102ee64d4638d..63139cacd8fba 100644 --- a/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll +++ b/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll @@ -34,6 +34,7 @@ ; RUN: -supports-hot-cold-new \ ; RUN: -thinlto-distributed-indexes \ ; RUN: -r=%t/b.o,_Z3fooi,plx \ +; RUN: -r=%t/b.o,aliasee,plx \ ; RUN: -r=%t/b.o,a \ ; RUN: -r=%t/b.o,b \ ; RUN: -r=%t/b.o,_Znam \ @@ -65,11 +66,15 @@ source_filename = "b.cpp" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" +;; Make sure the distributed summary bitcode writing succeeds when the memprof +;; metadata is in an aliasee. +@_Z3fooi = alias void (), ptr @aliasee + @a = external local_unnamed_addr global ptr, align 8 @b = external local_unnamed_addr global i32, align 4 ; Function Attrs: mustprogress uwtable -define dso_local void @_Z3fooi(i32 noundef %0) local_unnamed_addr #0 !dbg !9 { +define dso_local void @aliasee(i32 noundef %0) local_unnamed_addr #0 !dbg !9 { br label %2, !dbg !12 2: ; preds = %7, %1 @@ -222,4 +227,4 @@ attributes #1 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "t !19 = !DILocation(line: 7, column: 5, scope: !9) !20 = !{i64 8256520048276991898} !21 = !DILocation(line: 8, column: 5, scope: !9) -!22 = !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !1, file: !1, line: 1, type: !10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) \ No newline at end of file +!22 = !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !1, file: !1, line: 1, type: !10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/ctpop-range.ll b/llvm/test/Transforms/CorrelatedValuePropagation/ctpop-range.ll new file mode 100644 index 0000000000000..7101244dff4c4 --- /dev/null +++ b/llvm/test/Transforms/CorrelatedValuePropagation/ctpop-range.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=correlated-propagation %s | FileCheck %s + +declare void @use(i1) + +define void @ctpop1(i8 %v) { +; CHECK-LABEL: define void @ctpop1( +; CHECK-SAME: i8 [[V:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[RES:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[V]]) +; CHECK-NEXT: [[C0_0:%.*]] = icmp samesign uge i8 [[RES]], 3 +; CHECK-NEXT: [[C0_1:%.*]] = icmp samesign ule i8 [[RES]], 7 +; CHECK-NEXT: [[C0:%.*]] = and i1 [[C0_0]], [[C0_1]] +; CHECK-NEXT: br i1 [[C0]], label %[[RANGE_3_8:.*]], label %[[ED:.*]] +; CHECK: [[RANGE_3_8]]: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[CMP1:%.*]] = icmp uge i8 [[V]], 8 +; CHECK-NEXT: call void @use(i1 [[CMP1]]) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[CMP3:%.*]] = icmp ule i8 [[V]], -3 +; CHECK-NEXT: call void @use(i1 [[CMP3]]) +; CHECK-NEXT: ret void +; CHECK: [[ED]]: +; CHECK-NEXT: ret void +; +entry: + %res = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %v) + %c0.0 = icmp uge i8 %res, 3 + %c0.1 = icmp ule i8 %res, 7 + %c0 = and i1 %c0.0, %c0.1 + br i1 %c0, label %range.3.8, label %ed + +range.3.8: + %cmp0 = icmp uge i8 %v, 7 + call void @use(i1 %cmp0) ; true + %cmp1 = icmp uge i8 %v, 8 + call void @use(i1 %cmp1) ; unknown + %cmp2 = icmp ule i8 %v, 254 + call void @use(i1 %cmp2) ; true + %cmp3 = icmp ule i8 %v, 253 + call void @use(i1 %cmp3) ; unknown + ret void + +ed: + ret void +} 
+ +define void @ctpop2(i8 %v) { +; CHECK-LABEL: define void @ctpop2( +; CHECK-SAME: i8 [[V:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[RES:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[V]]) +; CHECK-NEXT: [[C2_0:%.*]] = icmp samesign uge i8 [[RES]], 1 +; CHECK-NEXT: [[C2_1:%.*]] = icmp samesign ule i8 [[RES]], 4 +; CHECK-NEXT: [[C2:%.*]] = and i1 [[C2_0]], [[C2_1]] +; CHECK-NEXT: br i1 [[C2]], label %[[RANGE_1_5:.*]], label %[[ED:.*]] +; CHECK: [[RANGE_1_5]]: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[CMP9:%.*]] = icmp uge i8 [[V]], 2 +; CHECK-NEXT: call void @use(i1 [[CMP9]]) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[CMP11:%.*]] = icmp ule i8 [[V]], -17 +; CHECK-NEXT: call void @use(i1 [[CMP11]]) +; CHECK-NEXT: ret void +; CHECK: [[ED]]: +; CHECK-NEXT: ret void +; +entry: + %res = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %v) + %c2.0 = icmp uge i8 %res, 1 + %c2.1 = icmp ule i8 %res, 4 + %c2 = and i1 %c2.0, %c2.1 + br i1 %c2, label %range.1.5, label %ed + +range.1.5: + %cmp8 = icmp uge i8 %v, 1 + call void @use(i1 %cmp8) ; true + %cmp9 = icmp uge i8 %v, 2 + call void @use(i1 %cmp9) ; unknown + %cmp10 = icmp ule i8 %v, 240 + call void @use(i1 %cmp10) ; true + %cmp11 = icmp ule i8 %v, 239 + call void @use(i1 %cmp11) ; unknown + ret void + +ed: + ret void +} + +define void @ctpop3(i8 %v) { +; CHECK-LABEL: define void @ctpop3( +; CHECK-SAME: i8 [[V:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ctpop.i8(i8 [[V]]) +; CHECK-NEXT: [[C3:%.*]] = icmp samesign uge i8 [[RES]], 8 +; CHECK-NEXT: br i1 [[C3]], label %[[RANGE_8_9:.*]], label %[[ED:.*]] +; CHECK: [[RANGE_8_9]]: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; CHECK: [[ED]]: +; CHECK-NEXT: ret void +; +entry: + %res = call i8 @llvm.ctpop.i8(i8 %v) + %c3 = icmp uge i8 %res, 8 + br i1 %c3, label %range.8.9, label %ed + +range.8.9: + %cmp4 = icmp eq i8 %v, -1 + call void @use(i1 %cmp4) ; true + ret void + +ed: + ret void +} + +define void @ctpop4(i8 %v) { +; CHECK-LABEL: define void @ctpop4( +; CHECK-SAME: i8 [[V:%.*]]) { +; CHECK-NEXT: [[TEST4:.*:]] +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.ctpop.i8(i8 [[V]]) +; CHECK-NEXT: [[C4:%.*]] = icmp eq i8 [[RES]], 0 +; CHECK-NEXT: br i1 [[C4]], label %[[RANGE_0_1:.*]], label %[[ED:.*]] +; CHECK: [[RANGE_0_1]]: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; CHECK: [[ED]]: +; CHECK-NEXT: ret void +; +test4: + %res = call i8 @llvm.ctpop.i8(i8 %v) + %c4 = icmp eq i8 %res, 0 + br i1 %c4, label %range.0.1, label %ed + +range.0.1: + %cmp5 = icmp eq i8 %v, 0 + call void @use(i1 %cmp5) ; true + ret void + +ed: + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll b/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll index 09d8bbf3c93bc..947d8a788c244 100644 --- a/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll +++ b/llvm/test/Transforms/DeadStoreElimination/memory-intrinsics-sizes.ll @@ -66,6 +66,81 @@ define void @memset_and_store_2(ptr %ptr, i64 %len) { ret void } +define void @memset_pattern_equal_size_values(ptr %ptr, i64 %len) { +; CHECK-LABEL: @memset_pattern_equal_size_values( +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 [[LEN:%.*]], i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len, i1 false) + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 
%len, i1 false) + ret void +} + +define void @memset_pattern_different_size_values_1(ptr %ptr, i64 %len.1, i64 %len.2) { +; CHECK-LABEL: @memset_pattern_different_size_values_1( +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 [[LEN_1:%.*]], i1 false) +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR]], i8 0, i64 [[LEN_2:%.*]], i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len.1, i1 false) + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len.2, i1 false) + ret void +} + +define void @memset_pattern_different_size_values_2(ptr %ptr, i64 %len) { +; CHECK-LABEL: @memset_pattern_different_size_values_2( +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 [[LEN:%.*]], i1 false) +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR]], i8 0, i64 100, i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len, i1 false) + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 100, i1 false) + ret void +} + +define void @memset_pattern_different_size_values_3(ptr %ptr, i64 %len) { +; CHECK-LABEL: @memset_pattern_different_size_values_3( +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 100, i1 false) +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR]], i8 0, i64 [[LEN:%.*]], i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 100, i1 false) + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len, i1 false) + ret void +} + +define void @memset_pattern_and_store_1(ptr %ptr, i64 %len) { +; CHECK-LABEL: @memset_pattern_and_store_1( +; CHECK-NEXT: store i64 123, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR]], i8 0, i64 [[LEN:%.*]], i1 false) +; CHECK-NEXT: ret void +; + store i64 123, ptr %ptr + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len, i1 false) + ret void +} + +define void @memset_pattern_and_store_2(ptr %ptr, i64 %len) { +; CHECK-LABEL: @memset_pattern_and_store_2( +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 [[LEN:%.*]], i1 false) +; CHECK-NEXT: store i64 123, ptr [[PTR]], align 4 +; CHECK-NEXT: ret void +; + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 %len, i1 false) + store i64 123, ptr %ptr + ret void +} + +define void @memset_pattern_and_store_3(ptr %ptr) { +; CHECK-LABEL: @memset_pattern_and_store_3( +; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 [[PTR:%.*]], i8 0, i64 13, i1 false) +; CHECK-NEXT: ret void +; + store i64 0, ptr %ptr + call void @llvm.experimental.memset.pattern.p0.i8.i64(ptr align 1 %ptr, i8 0, i64 13, i1 false) + ret void +} + define void @memcpy_equal_size_values(ptr noalias %src, ptr noalias %dst, i64 %len) { ; CHECK-LABEL: @memcpy_equal_size_values( ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]], i1 false) diff --git a/llvm/test/Transforms/IndVarSimplify/implied-via-addition.ll 
b/llvm/test/Transforms/IndVarSimplify/implied-via-addition.ll new file mode 100644 index 0000000000000..865c10e3913aa --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/implied-via-addition.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=indvars -S < %s | FileCheck %s + +declare void @use(i1) + +declare void @llvm.experimental.guard(i1, ...) + +define void @test_01(i8 %t) { +; CHECK-LABEL: define void @test_01( +; CHECK-SAME: i8 [[T:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ST:%.*]] = sext i8 [[T]] to i16 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i16 [[ST]], 42 +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[CMP1]]) [ "deopt"() ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[T]], %[[ENTRY]] ], [ [[IDX_INC:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IDX_INC]] = add nsw i8 [[IDX]], 1 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[BE:%.*]] = icmp slt i8 [[IDX_INC]], 42 +; CHECK-NEXT: br i1 [[BE]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + entry: + %st = sext i8 %t to i16 + %cmp1 = icmp slt i16 %st, 42 + call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ] + br label %loop + + loop: + %idx = phi i8 [ %t, %entry ], [ %idx.inc, %loop ] + %idx.inc = add i8 %idx, 1 + %c = icmp slt i8 %idx, 42 + call void @use(i1 %c) + %be = icmp slt i8 %idx.inc, 42 + br i1 %be, label %loop, label %exit + + exit: + ret void +} + +define void @test_02(i8 %t) { +; CHECK-LABEL: define void @test_02( +; CHECK-SAME: i8 [[T:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[T_PTR:%.*]] = inttoptr i8 [[T]] to ptr +; CHECK-NEXT: [[P_42:%.*]] = inttoptr i8 42 to ptr +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt ptr [[T_PTR]], [[P_42]] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[CMP1]]) [ "deopt"() ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IDX:%.*]] = phi ptr [ [[T_PTR]], %[[ENTRY]] ], [ [[SNEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[SNEXT]] = getelementptr inbounds i8, ptr [[IDX]], i64 1 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[BE:%.*]] = icmp ult ptr [[SNEXT]], [[P_42]] +; CHECK-NEXT: br i1 [[BE]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + entry: + %t.ptr = inttoptr i8 %t to ptr + %p.42 = inttoptr i8 42 to ptr + %cmp1 = icmp slt ptr %t.ptr, %p.42 + call void(i1, ...) 
@llvm.experimental.guard(i1 %cmp1) [ "deopt"() ] + br label %loop + + loop: + %idx = phi ptr [ %t.ptr, %entry ], [ %snext, %loop ] + %snext = getelementptr inbounds i8, ptr %idx, i64 1 + %c = icmp slt ptr %idx, %p.42 + call void @use(i1 %c) + %be = icmp slt ptr %snext, %p.42 + br i1 %be, label %loop, label %exit + + exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loopvectorize_pr33804_double.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loopvectorize_pr33804_double.ll index 0f3db228e9cfe..8da4f0c456b6b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/loopvectorize_pr33804_double.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loopvectorize_pr33804_double.ll @@ -26,7 +26,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en %next19.i.i = getelementptr inbounds %struct.CvNode1D, ptr %dst, i32 %i.1424.i.i, i32 1 store ptr %dst, ptr %next19.i.i, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i @@ -52,7 +52,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en %val.i.i = getelementptr inbounds %struct.CvNode1D2, ptr %arrayidx15.i.i1427, i32 0, i32 1 store double 0xC415AF1D80000000, ptr %val.i.i, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i @@ -79,7 +79,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en store double %load_d, ptr %dst.ptr, align 4 store ptr %load_p, ptr %dst.ptr.1, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i @@ -107,7 +107,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en store double %load_d, ptr %dst.ptr, align 4 store ptr %load_p, ptr %dst.ptr.1, align 4 %inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1 - %exitcond438.i.i = icmp eq i32 %inc21.i.i, 0 + %exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000 br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i for.end22.i.i: ; preds = %for.body14.i.i diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index 5cc00daab7ce5..37c489cd0d4cf 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -10,10 +10,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -31,7 
+31,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]] ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) @@ -42,12 +42,12 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]] ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP15]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 ; CHECK-NEXT: [[TMP18:%.*]] = insertelement zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 @@ -71,8 +71,29 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP27]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: for.exit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -89,7 +110,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -211,3 +232,13 @@ while.end.loopexit: ; preds = %while.body attributes #0 = { vscale_range(1,16) "target-features"="+sve" } attributes #1 = { "target-cpu"="apple-m1" } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll index 74db8683d5df8..5c7ea8efa7ed7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -9,7 +9,7 @@ define i32 @dotp_z_s(ptr %a, ptr %b) #0 { ; CHECK-LABEL: define i32 @dotp_z_s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -36,17 +36,38 @@ define i32 @dotp_z_s(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: 
[[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.exit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-NOI8MM-LABEL: define i32 @dotp_z_s( ; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NOI8MM-NEXT: entry: -; CHECK-NOI8MM-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NOI8MM-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: @@ -73,12 +94,33 @@ define i32 @dotp_z_s(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NOI8MM-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NOI8MM-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOI8MM: for.body: +; CHECK-NOI8MM-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NOI8MM-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-NOI8MM-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-NOI8MM-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-NOI8MM-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-NOI8MM-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-NOI8MM-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-NOI8MM-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-NOI8MM-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NOI8MM-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NOI8MM-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -95,7 +137,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -104,9 +146,9 @@ for.exit: ; preds = %for.body define i32 @dotp_s_z(ptr %a, ptr %b) #0 { ; CHECK-LABEL: define i32 @dotp_s_z( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -133,17 +175,38 @@ define i32 @dotp_s_z(ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.exit: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ 
[[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-NOI8MM-LABEL: define i32 @dotp_s_z( -; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NOI8MM-NEXT: entry: -; CHECK-NOI8MM-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NOI8MM-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: @@ -170,12 +233,33 @@ define i32 @dotp_s_z(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-NOI8MM-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NOI8MM-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NOI8MM-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NOI8MM: for.body: +; CHECK-NOI8MM-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-NOI8MM-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-NOI8MM-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-NOI8MM-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-NOI8MM-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-NOI8MM-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-NOI8MM-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-NOI8MM-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-NOI8MM-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NOI8MM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-NOI8MM-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -192,7 +276,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -204,3 +288,18 @@ 
for.exit: ; preds = %for.body !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} !10 = !{!"llvm.loop.vectorize.enable", i1 true} attributes #0 = { vscale_range(1,16) "target-features"="+sve" } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. +; CHECK-NOI8MM: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-NOI8MM: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-NOI8MM: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-NOI8MM: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-NOI8MM: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-NOI8MM: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index c66695f1b50f0..97a5801d88108 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -10,7 +10,7 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-LABEL: define i32 @dotp( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -28,16 +28,37 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: 
[[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -64,17 +85,38 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]]) ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: 
[[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -92,11 +134,32 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -113,7 +176,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -124,7 +187,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; 
CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -202,13 +265,37 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] ; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -357,13 +444,38 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] ; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> 
[[TMP136]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]] +; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP140]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP140]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -441,8 +553,32 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] ; CHECK-MAXBW-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -459,7 +595,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -470,7 +606,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -489,13 +625,38 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: 
for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -514,13 +675,38 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = 
getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -539,8 +725,33 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 
[[ADD_LCSSA]] ; entry: br label %for.body @@ -557,7 +768,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -568,7 +779,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -586,13 +797,38 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; 
CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -610,13 +846,38 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -634,8 +895,33 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; 
CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15 +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -652,7 +938,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %ext.b %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -729,6 +1015,66 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], 
[[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX14]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX15]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX16]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: 
[[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP36]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]] +; CHECK-INTERLEAVE1-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]] +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( ; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -799,6 +1145,66 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX14]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX15]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX16]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-INTERLEAVED-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-INTERLEAVED-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], 
i64 [[OFFSET_3]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP36]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]] +; CHECK-INTERLEAVED-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]] +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled( ; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -869,6 +1275,66 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; 
CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP34]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i32 [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i32 [ [[TMP36]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX14]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX15]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX16]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]] +; CHECK-MAXBW-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]] +; CHECK-MAXBW-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-MAXBW-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]] +; CHECK-MAXBW-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]] +; CHECK-MAXBW-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-MAXBW-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]] +; CHECK-MAXBW-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]] +; CHECK-MAXBW-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]] +; CHECK-MAXBW-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]] +; CHECK-MAXBW-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]] +; CHECK-MAXBW-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]] +; CHECK-MAXBW-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]] +; CHECK-MAXBW-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]] +; CHECK-MAXBW-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32 +; CHECK-MAXBW-NEXT: 
[[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]] +; CHECK-MAXBW-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP36]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]] +; CHECK-MAXBW-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]] +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]] +; CHECK-MAXBW-NEXT: ret i32 [[RESULT]] ; entry: br label %for.body @@ -956,6 +1422,27 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated( ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -997,6 +1484,27 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], 
[[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated( ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1028,6 +1536,27 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; 
+; CHECK-MAXBW:       exit:
+; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT:    ret i32 [[ADD_LCSSA]]
 ;
 entry:
   br label %for.body
@@ -1087,6 +1616,325 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK-INTERLEAVE1:       pred.load.continue:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if1:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK-INTERLEAVE1:       pred.load.continue2:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if3:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
+; CHECK-INTERLEAVE1:       pred.load.continue4:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if5:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK-INTERLEAVE1:       pred.load.continue6:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if7:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
+; CHECK-INTERLEAVE1:       pred.load.continue8:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if9:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
+; CHECK-INTERLEAVE1:       pred.load.continue10:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if11:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
+; CHECK-INTERLEAVE1:       pred.load.continue12:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if13:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
+; CHECK-INTERLEAVE1:       pred.load.continue14:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if15:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
+; CHECK-INTERLEAVE1:       pred.load.continue16:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if17:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
+; CHECK-INTERLEAVE1:       pred.load.continue18:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if19:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
+; CHECK-INTERLEAVE1:       pred.load.continue20:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if21:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE22]]
+; CHECK-INTERLEAVE1:       pred.load.continue22:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if23:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE24]]
+; CHECK-INTERLEAVE1:       pred.load.continue24:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if25:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE26]]
+; CHECK-INTERLEAVE1:       pred.load.continue26:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if27:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE28]]
+; CHECK-INTERLEAVE1:       pred.load.continue28:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if29:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE30]]
+; CHECK-INTERLEAVE1:       pred.load.continue30:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if31:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE32]]
+; CHECK-INTERLEAVE1:       pred.load.continue32:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if33:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE34]]
+; CHECK-INTERLEAVE1:       pred.load.continue34:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if35:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE36]]
+; CHECK-INTERLEAVE1:       pred.load.continue36:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if37:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE38]]
+; CHECK-INTERLEAVE1:       pred.load.continue38:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if39:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE40]]
+; CHECK-INTERLEAVE1:       pred.load.continue40:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if41:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE42]]
+; CHECK-INTERLEAVE1:       pred.load.continue42:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if43:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE44]]
+; CHECK-INTERLEAVE1:       pred.load.continue44:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if45:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE46]]
+; CHECK-INTERLEAVE1:       pred.load.continue46:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if47:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE48]]
+; CHECK-INTERLEAVE1:       pred.load.continue48:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if49:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE50]]
+; CHECK-INTERLEAVE1:       pred.load.continue50:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if51:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE52]]
+; CHECK-INTERLEAVE1:       pred.load.continue52:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if53:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE54]]
+; CHECK-INTERLEAVE1:       pred.load.continue54:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if55:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE56]]
+; CHECK-INTERLEAVE1:       pred.load.continue56:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if57:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE58]]
+; CHECK-INTERLEAVE1:       pred.load.continue58:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
+; CHECK-INTERLEAVE1:       pred.load.if59:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE60]]
+; CHECK-INTERLEAVE1:       pred.load.continue60:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
+; CHECK-INTERLEAVE1:       pred.load.if61:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
+; CHECK-INTERLEAVE1-NEXT:    br label [[PRED_LOAD_CONTINUE62]]
+; CHECK-INTERLEAVE1:       pred.load.continue62:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
+; CHECK-INTERLEAVE1-NEXT:    [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVE1:       middle.block:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]])
+; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1:       scalar.ph:
+; CHECK-INTERLEAVE1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP183]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       for.body:
+; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVE1-NEXT:    [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVE1-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVE1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVE1-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVE1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVE1:       exit:
+; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP183]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT:    ret i32 [[ADD_LCSSA]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma(
 ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1123,6 +1971,325 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if:
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK-INTERLEAVED:       pred.load.continue:
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if1:
+; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK-INTERLEAVED:       pred.load.continue2:
+; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if3:
+; CHECK-INTERLEAVED-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
+; CHECK-INTERLEAVED:       pred.load.continue4:
+; CHECK-INTERLEAVED-NEXT:    [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if5:
+; CHECK-INTERLEAVED-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK-INTERLEAVED:       pred.load.continue6:
+; CHECK-INTERLEAVED-NEXT:    [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if7:
+; CHECK-INTERLEAVED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
+; CHECK-INTERLEAVED:       pred.load.continue8:
+; CHECK-INTERLEAVED-NEXT:    [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if9:
+; CHECK-INTERLEAVED-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
+; CHECK-INTERLEAVED:       pred.load.continue10:
+; CHECK-INTERLEAVED-NEXT:    [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if11:
+; CHECK-INTERLEAVED-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
+; CHECK-INTERLEAVED:       pred.load.continue12:
+; CHECK-INTERLEAVED-NEXT:    [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if13:
+; CHECK-INTERLEAVED-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
+; CHECK-INTERLEAVED:       pred.load.continue14:
+; CHECK-INTERLEAVED-NEXT:    [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if15:
+; CHECK-INTERLEAVED-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
+; CHECK-INTERLEAVED:       pred.load.continue16:
+; CHECK-INTERLEAVED-NEXT:    [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if17:
+; CHECK-INTERLEAVED-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
+; CHECK-INTERLEAVED:       pred.load.continue18:
+; CHECK-INTERLEAVED-NEXT:    [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if19:
+; CHECK-INTERLEAVED-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
+; CHECK-INTERLEAVED:       pred.load.continue20:
+; CHECK-INTERLEAVED-NEXT:    [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if21:
+; CHECK-INTERLEAVED-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE22]]
+; CHECK-INTERLEAVED:       pred.load.continue22:
+; CHECK-INTERLEAVED-NEXT:    [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if23:
+; CHECK-INTERLEAVED-NEXT:    [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE24]]
+; CHECK-INTERLEAVED:       pred.load.continue24:
+; CHECK-INTERLEAVED-NEXT:    [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if25:
+; CHECK-INTERLEAVED-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE26]]
+; CHECK-INTERLEAVED:       pred.load.continue26:
+; CHECK-INTERLEAVED-NEXT:    [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if27:
+; CHECK-INTERLEAVED-NEXT:    [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE28]]
+; CHECK-INTERLEAVED:       pred.load.continue28:
+; CHECK-INTERLEAVED-NEXT:    [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if29:
+; CHECK-INTERLEAVED-NEXT:    [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE30]]
+; CHECK-INTERLEAVED:       pred.load.continue30:
+; CHECK-INTERLEAVED-NEXT:    [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if31:
+; CHECK-INTERLEAVED-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE32]]
+; CHECK-INTERLEAVED:       pred.load.continue32:
+; CHECK-INTERLEAVED-NEXT:    [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if33:
+; CHECK-INTERLEAVED-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE34]]
+; CHECK-INTERLEAVED:       pred.load.continue34:
+; CHECK-INTERLEAVED-NEXT:    [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if35:
+; CHECK-INTERLEAVED-NEXT:    [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE36]]
+; CHECK-INTERLEAVED:       pred.load.continue36:
+; CHECK-INTERLEAVED-NEXT:    [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if37:
+; CHECK-INTERLEAVED-NEXT:    [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE38]]
+; CHECK-INTERLEAVED:       pred.load.continue38:
+; CHECK-INTERLEAVED-NEXT:    [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if39:
+; CHECK-INTERLEAVED-NEXT:    [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE40]]
+; CHECK-INTERLEAVED:       pred.load.continue40:
+; CHECK-INTERLEAVED-NEXT:    [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if41:
+; CHECK-INTERLEAVED-NEXT:    [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE42]]
+; CHECK-INTERLEAVED:       pred.load.continue42:
+; CHECK-INTERLEAVED-NEXT:    [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if43:
+; CHECK-INTERLEAVED-NEXT:    [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE44]]
+; CHECK-INTERLEAVED:       pred.load.continue44:
+; CHECK-INTERLEAVED-NEXT:    [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if45:
+; CHECK-INTERLEAVED-NEXT:    [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE46]]
+; CHECK-INTERLEAVED:       pred.load.continue46:
+; CHECK-INTERLEAVED-NEXT:    [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if47:
+; CHECK-INTERLEAVED-NEXT:    [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE48]]
+; CHECK-INTERLEAVED:       pred.load.continue48:
+; CHECK-INTERLEAVED-NEXT:    [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if49:
+; CHECK-INTERLEAVED-NEXT:    [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE50]]
+; CHECK-INTERLEAVED:       pred.load.continue50:
+; CHECK-INTERLEAVED-NEXT:    [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if51:
+; CHECK-INTERLEAVED-NEXT:    [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE52]]
+; CHECK-INTERLEAVED:       pred.load.continue52:
+; CHECK-INTERLEAVED-NEXT:    [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if53:
+; CHECK-INTERLEAVED-NEXT:    [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE54]]
+; CHECK-INTERLEAVED:       pred.load.continue54:
+; CHECK-INTERLEAVED-NEXT:    [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if55:
+; CHECK-INTERLEAVED-NEXT:    [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE56]]
+; CHECK-INTERLEAVED:       pred.load.continue56:
+; CHECK-INTERLEAVED-NEXT:    [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if57:
+; CHECK-INTERLEAVED-NEXT:    [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE58]]
+; CHECK-INTERLEAVED:       pred.load.continue58:
+; CHECK-INTERLEAVED-NEXT:    [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
+; CHECK-INTERLEAVED:       pred.load.if59:
+; CHECK-INTERLEAVED-NEXT:    [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE60]]
+; CHECK-INTERLEAVED:       pred.load.continue60:
+; CHECK-INTERLEAVED-NEXT:    [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
+; CHECK-INTERLEAVED:       pred.load.if61:
+; CHECK-INTERLEAVED-NEXT:    [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
+; CHECK-INTERLEAVED-NEXT:    br label [[PRED_LOAD_CONTINUE62]]
+; CHECK-INTERLEAVED:       pred.load.continue62:
+; CHECK-INTERLEAVED-NEXT:    [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
+; CHECK-INTERLEAVED-NEXT:    [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVED:       middle.block:
+; CHECK-INTERLEAVED-NEXT:    [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]])
+; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED:       scalar.ph:
+; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP183]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       for.body:
+; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32
+; CHECK-INTERLEAVED-NEXT:    [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT:    [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32
+; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]]
+; CHECK-INTERLEAVED-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVED:       exit:
+; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP183]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT:    ret i32 [[ADD_LCSSA]]
 ;
 ; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma(
 ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1159,6 +2326,325 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) {
 ; CHECK-MAXBW-NEXT:    [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-MAXBW-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-MAXBW:       pred.load.if:
+; CHECK-MAXBW-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT:    [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK-MAXBW:       pred.load.continue:
+; CHECK-MAXBW-NEXT:    [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
+; CHECK-MAXBW-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
+; CHECK-MAXBW:       pred.load.if1:
+; CHECK-MAXBW-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
+; CHECK-MAXBW:       pred.load.continue2:
+; CHECK-MAXBW-NEXT:    [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-MAXBW-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
+; CHECK-MAXBW-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
+; CHECK-MAXBW:       pred.load.if3:
+; CHECK-MAXBW-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
+; CHECK-MAXBW:       pred.load.continue4:
+; CHECK-MAXBW-NEXT:    [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-MAXBW-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
+; CHECK-MAXBW-NEXT:    br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
+; CHECK-MAXBW:       pred.load.if5:
+; CHECK-MAXBW-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; CHECK-MAXBW:       pred.load.continue6:
+; CHECK-MAXBW-NEXT:    [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-MAXBW-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
+; CHECK-MAXBW-NEXT:    br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
+; CHECK-MAXBW:       pred.load.if7:
+; CHECK-MAXBW-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-MAXBW-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
+; CHECK-MAXBW:       pred.load.continue8:
+; CHECK-MAXBW-NEXT:    [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-MAXBW-NEXT:    [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
+; CHECK-MAXBW-NEXT:    br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
+; CHECK-MAXBW:       pred.load.if9:
+; CHECK-MAXBW-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-MAXBW-NEXT:    [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
+; CHECK-MAXBW:       pred.load.continue10:
+; CHECK-MAXBW-NEXT:    [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-MAXBW-NEXT:    [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
+; CHECK-MAXBW-NEXT:    br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
+; CHECK-MAXBW:       pred.load.if11:
+; CHECK-MAXBW-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
+; CHECK-MAXBW:       pred.load.continue12:
+; CHECK-MAXBW-NEXT:    [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-MAXBW-NEXT:    [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
+; CHECK-MAXBW-NEXT:    br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
+; CHECK-MAXBW:       pred.load.if13:
+; CHECK-MAXBW-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-MAXBW-NEXT:    [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
+; CHECK-MAXBW:       pred.load.continue14:
+; CHECK-MAXBW-NEXT:    [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-MAXBW-NEXT:    [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
+; CHECK-MAXBW-NEXT:    br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
+; CHECK-MAXBW:       pred.load.if15:
+; CHECK-MAXBW-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-MAXBW-NEXT:    [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
+; CHECK-MAXBW:       pred.load.continue16:
+; CHECK-MAXBW-NEXT:    [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-MAXBW-NEXT:    [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
+; CHECK-MAXBW-NEXT:    br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
+; CHECK-MAXBW:       pred.load.if17:
+; CHECK-MAXBW-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT:    [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
+; CHECK-MAXBW:       pred.load.continue18:
+; CHECK-MAXBW-NEXT:    [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-MAXBW-NEXT:    [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
+; CHECK-MAXBW-NEXT:    br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
+; CHECK-MAXBW:       pred.load.if19:
+; CHECK-MAXBW-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT:    [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
+; CHECK-MAXBW:       pred.load.continue20:
+; CHECK-MAXBW-NEXT:    [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-MAXBW-NEXT:    [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
+; CHECK-MAXBW-NEXT:    br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
+; CHECK-MAXBW:       pred.load.if21:
+; CHECK-MAXBW-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT:    [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE22]]
+; CHECK-MAXBW:       pred.load.continue22:
+; CHECK-MAXBW-NEXT:    [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-MAXBW-NEXT:    [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
+; CHECK-MAXBW-NEXT:    br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
+; CHECK-MAXBW:       pred.load.if23:
+; CHECK-MAXBW-NEXT:    [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT:    [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE24]]
+; CHECK-MAXBW:       pred.load.continue24:
+; CHECK-MAXBW-NEXT:    [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-MAXBW-NEXT:    [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
+; CHECK-MAXBW-NEXT:    br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
+; CHECK-MAXBW:       pred.load.if25:
+; CHECK-MAXBW-NEXT:    [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-MAXBW-NEXT:    [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE26]]
+; CHECK-MAXBW:       pred.load.continue26:
+; CHECK-MAXBW-NEXT:    [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-MAXBW-NEXT:    [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
+; CHECK-MAXBW-NEXT:    br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
+; CHECK-MAXBW:       pred.load.if27:
+; CHECK-MAXBW-NEXT:    [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-MAXBW-NEXT:    [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE28]]
+; CHECK-MAXBW:       pred.load.continue28:
+; CHECK-MAXBW-NEXT:    [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-MAXBW-NEXT:    [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
+; CHECK-MAXBW-NEXT:    br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-MAXBW:       pred.load.if29:
+; CHECK-MAXBW-NEXT:    [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
+; CHECK-MAXBW-NEXT:    [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE30]]
+; CHECK-MAXBW:       pred.load.continue30:
+; CHECK-MAXBW-NEXT:    [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-MAXBW-NEXT:    [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
+; CHECK-MAXBW-NEXT:    br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
+; CHECK-MAXBW:       pred.load.if31:
+; CHECK-MAXBW-NEXT:    [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT:    [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE32]]
+; CHECK-MAXBW:       pred.load.continue32:
+; CHECK-MAXBW-NEXT:    [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
+; CHECK-MAXBW-NEXT:    [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
+; CHECK-MAXBW-NEXT:    br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
+; CHECK-MAXBW:       pred.load.if33:
+; CHECK-MAXBW-NEXT:    [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE34]]
+; CHECK-MAXBW:       pred.load.continue34:
+; CHECK-MAXBW-NEXT:    [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
+; CHECK-MAXBW-NEXT:    [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
+; CHECK-MAXBW-NEXT:    br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
+; CHECK-MAXBW:       pred.load.if35:
+; CHECK-MAXBW-NEXT:    [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT:    [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE36]]
+; CHECK-MAXBW:       pred.load.continue36:
+; CHECK-MAXBW-NEXT:    [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
+; CHECK-MAXBW-NEXT:    [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
+; CHECK-MAXBW-NEXT:    br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
+; CHECK-MAXBW:       pred.load.if37:
+; CHECK-MAXBW-NEXT:    [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
+; CHECK-MAXBW-NEXT:    br label [[PRED_LOAD_CONTINUE38]]
+; CHECK-MAXBW:       pred.load.continue38:
+; CHECK-MAXBW-NEXT:    [[TMP117:%.*]] = phi <16 x i8> [ 
[[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] +; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 +; CHECK-MAXBW-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] +; CHECK-MAXBW: pred.load.if39: +; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE40]] +; CHECK-MAXBW: pred.load.continue40: +; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] +; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 +; CHECK-MAXBW-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] +; CHECK-MAXBW: pred.load.if41: +; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE42]] +; CHECK-MAXBW: pred.load.continue42: +; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] +; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 +; CHECK-MAXBW-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] +; CHECK-MAXBW: pred.load.if43: +; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 +; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE44]] +; CHECK-MAXBW: pred.load.continue44: +; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] +; CHECK-MAXBW-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 +; CHECK-MAXBW-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] +; CHECK-MAXBW: pred.load.if45: +; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 +; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE46]] +; CHECK-MAXBW: pred.load.continue46: +; CHECK-MAXBW-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] +; CHECK-MAXBW-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 +; CHECK-MAXBW-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] +; CHECK-MAXBW: pred.load.if47: +; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-MAXBW-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE48]] +; CHECK-MAXBW: pred.load.continue48: +; CHECK-MAXBW-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] +; 
CHECK-MAXBW-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 +; CHECK-MAXBW-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] +; CHECK-MAXBW: pred.load.if49: +; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-MAXBW-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE50]] +; CHECK-MAXBW: pred.load.continue50: +; CHECK-MAXBW-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] +; CHECK-MAXBW-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 +; CHECK-MAXBW-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] +; CHECK-MAXBW: pred.load.if51: +; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 +; CHECK-MAXBW-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE52]] +; CHECK-MAXBW: pred.load.continue52: +; CHECK-MAXBW-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] +; CHECK-MAXBW-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 +; CHECK-MAXBW-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] +; CHECK-MAXBW: pred.load.if53: +; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 +; CHECK-MAXBW-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE54]] +; CHECK-MAXBW: pred.load.continue54: +; CHECK-MAXBW-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] +; CHECK-MAXBW-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 +; CHECK-MAXBW-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] +; CHECK-MAXBW: pred.load.if55: +; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] +; CHECK-MAXBW-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 +; CHECK-MAXBW-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE56]] +; CHECK-MAXBW: pred.load.continue56: +; CHECK-MAXBW-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] +; CHECK-MAXBW-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 +; CHECK-MAXBW-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] +; CHECK-MAXBW: pred.load.if57: +; CHECK-MAXBW-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 +; CHECK-MAXBW-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE58]] +; CHECK-MAXBW: pred.load.continue58: +; CHECK-MAXBW-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] +; CHECK-MAXBW-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 
+; CHECK-MAXBW-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] +; CHECK-MAXBW: pred.load.if59: +; CHECK-MAXBW-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] +; CHECK-MAXBW-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 +; CHECK-MAXBW-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE60]] +; CHECK-MAXBW: pred.load.continue60: +; CHECK-MAXBW-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] +; CHECK-MAXBW-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 +; CHECK-MAXBW-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] +; CHECK-MAXBW: pred.load.if61: +; CHECK-MAXBW-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] +; CHECK-MAXBW-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 +; CHECK-MAXBW-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 +; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE62]] +; CHECK-MAXBW: pred.load.continue62: +; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK-MAXBW-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] +; CHECK-MAXBW-NEXT: [[TMP180]] = add <16 x i32> [[TMP179]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[TMP181:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP180]], <16 x i32> [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) +; CHECK-MAXBW-NEXT: [[TMP182:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP182]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP183:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP181]]) +; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP183]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; 
CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP183]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -1186,7 +2672,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_extend_user( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -1204,14 +2690,14 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 ; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: -; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVE1: for.body: @@ -1226,7 +2712,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVE1: for.exit: ; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] @@ -1237,7 +2723,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -1264,7 +2750,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] ; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]] @@ -1272,7 +2758,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15 ; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: -; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVED: for.body: @@ -1287,7 +2773,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: for.exit: ; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] @@ -1298,7 +2784,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -1316,14 +2802,14 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 ; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: -; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-MAXBW: for.body: @@ -1338,7 +2824,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[MUL:%.*]] = 
mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-MAXBW: for.exit: ; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] @@ -1361,7 +2847,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -1373,3 +2859,64 @@ for.exit: ; preds = %for.body !8 = !{!"llvm.loop.mustprogress"} !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} !10 = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVE1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +;. 
+; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-INTERLEAVED: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +;. +; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-MAXBW: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index af2a7b966f700..a0214ae88c2d6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -12,12 +12,12 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]] @@ -40,7 +40,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] ; CHECK-INTERLEAVE1: scalar.ph: ; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] @@ -58,7 +58,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVE1: for.exit: ; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ] @@ -69,12 +69,12 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem 
i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]] @@ -111,7 +111,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]] ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] ; CHECK-INTERLEAVED: scalar.ph: ; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] @@ -129,7 +129,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVED: for.exit: ; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ] @@ -140,12 +140,12 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] @@ -166,6 +166,31 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE5]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ 
[[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -182,7 +207,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -193,7 +218,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: @@ -271,13 +296,13 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = mul <16 x i32> [[TMP67]], [[TMP18]] ; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) ; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: -; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ] ; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVE1: for.body: @@ -292,7 +317,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; 
CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-INTERLEAVE1: for.exit: ; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ] @@ -301,7 +326,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: @@ -450,17 +475,38 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP137]] = add <16 x i32> [[TMP135]], [[VEC_PHI]] ; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]] ; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP142]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP142]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; 
CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: @@ -538,8 +584,32 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = mul <16 x i32> [[TMP134]], [[TMP36]] ; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP138]]) +; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -556,7 +626,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -569,12 +639,12 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; 
CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -600,18 +670,50 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = extractelement [[TMP18]], i32 [[TMP22]] +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP16]], i32 [[TMP26]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; 
CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -648,18 +750,50 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement [[TMP27]], i32 [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul i32 [[TMP33]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 1 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP25]], i32 [[TMP35]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop 
[[LOOP7:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -685,6 +819,38 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 8 +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement [[TMP27]], i32 [[TMP22]] +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = mul i32 [[TMP24]], 8 +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP25]], i32 [[TMP31]] +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -701,7 +867,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -714,12 +880,12 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -744,18 +910,50 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sub i32 [[TMP20]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = extractelement [[TMP17]], i32 [[TMP21]] +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul i32 [[TMP23]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sub i32 [[TMP24]], 1 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP17]], i32 [[TMP25]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] 
= zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -784,18 +982,50 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul i32 [[TMP23]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sub i32 [[TMP31]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = extractelement [[TMP21]], i32 [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP21]], i32 [[TMP29]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] 
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() @@ -820,6 +1050,38 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 8 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sub i32 [[TMP27]], 1 +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement [[TMP21]], i32 [[TMP28]] +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul i32 [[TMP23]], 8 +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sub i32 [[TMP24]], 1 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP21]], i32 [[TMP25]] +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; 
CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -836,7 +1098,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %ext.b %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -912,6 +1174,73 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]] ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP41]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP35]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP30]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP23]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP43]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i32 [ [[TMP44]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX12:%.*]] = phi i32 [ [[TMP45]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP46]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX11]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: 
[[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX12]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP46]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP45]], 
[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP44]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP43]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]] +; CHECK-INTERLEAVE1-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]] +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( ; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1033,6 +1362,77 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP40]] ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP81]], [[TMP80]] +; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX23:%.*]] = add [[TMP65]], [[TMP64]] +; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX23]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX24:%.*]] = add [[TMP49]], [[TMP48]] +; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX24]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX25:%.*]] = add [[TMP33]], [[TMP50]] +; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX25]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP83]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX26:%.*]] = phi i32 [ [[TMP84]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX27:%.*]] = phi i32 [ [[TMP85]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX28:%.*]] = phi i32 [ [[TMP86]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX26]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX27]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX28]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1 +; 
CHECK-INTERLEAVED-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-INTERLEAVED-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-INTERLEAVED-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVED-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B0:%.*]] = sext i8 [[LOAD_B0]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]] +; CHECK-INTERLEAVED-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP84]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP83]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]] +; CHECK-INTERLEAVED-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]] +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add 
nsw i32 [[RESULT0]], [[RESULT1]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp_unrolled( ; CHECK-MAXBW-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1102,6 +1502,73 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE16]]) +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE17]]) +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE11]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP39]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i32 [ [[TMP40]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX15:%.*]] = phi i32 [ [[TMP41]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i32 [ [[TMP42]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM3:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD_A3:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM2:%.*]] = phi i32 [ [[BC_MERGE_RDX14]], [[SCALAR_PH]] ], [ [[ADD_A2:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM1:%.*]] = phi i32 [ [[BC_MERGE_RDX15]], [[SCALAR_PH]] ], [ [[ADD_A1:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM0:%.*]] = phi i32 [ [[BC_MERGE_RDX16]], [[SCALAR_PH]] ], [ [[ADD_A0:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[GEP_B0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[OFFSET_1:%.*]] = or disjoint i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[GEP_A1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_1]] +; CHECK-MAXBW-NEXT: [[GEP_B1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_1]] +; CHECK-MAXBW-NEXT: [[OFFSET_2:%.*]] = or disjoint i64 [[IV]], 2 +; CHECK-MAXBW-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_2]] +; CHECK-MAXBW-NEXT: [[GEP_B2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_2]] +; CHECK-MAXBW-NEXT: [[OFFSET_3:%.*]] = or disjoint i64 [[IV]], 3 +; CHECK-MAXBW-NEXT: [[GEP_A3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[OFFSET_3]] +; CHECK-MAXBW-NEXT: [[GEP_B3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[OFFSET_3]] +; CHECK-MAXBW-NEXT: [[LOAD_A0:%.*]] = load i8, ptr [[GEP_A0]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A0:%.*]] = sext i8 [[LOAD_A0]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B0:%.*]] = load i8, ptr [[GEP_B0]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B0:%.*]] = 
sext i8 [[LOAD_B0]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_A0:%.*]] = mul nsw i32 [[EXT_B0]], [[EXT_A0]] +; CHECK-MAXBW-NEXT: [[ADD_A0]] = add nsw i32 [[MUL_A0]], [[ACCUM0]] +; CHECK-MAXBW-NEXT: [[LOAD_A1:%.*]] = load i8, ptr [[GEP_A1]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A1:%.*]] = sext i8 [[LOAD_A1]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_A1:%.*]] = mul nsw i32 [[EXT_A1]], [[EXT_B1]] +; CHECK-MAXBW-NEXT: [[ADD_A1]] = add nsw i32 [[MUL_A1]], [[ACCUM1]] +; CHECK-MAXBW-NEXT: [[LOAD_A2:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A2:%.*]] = sext i8 [[LOAD_A2]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B2:%.*]] = load i8, ptr [[GEP_B2]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B2:%.*]] = sext i8 [[LOAD_B2]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_A2:%.*]] = mul nsw i32 [[EXT_A2]], [[EXT_B2]] +; CHECK-MAXBW-NEXT: [[ADD_A2]] = add nsw i32 [[MUL_A2]], [[ACCUM2]] +; CHECK-MAXBW-NEXT: [[LOAD_A3:%.*]] = load i8, ptr [[GEP_A3]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A3:%.*]] = sext i8 [[LOAD_A3]] to i32 +; CHECK-MAXBW-NEXT: [[LOAD_B3:%.*]] = load i8, ptr [[GEP_B3]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B3:%.*]] = sext i8 [[LOAD_B3]] to i32 +; CHECK-MAXBW-NEXT: [[MUL_A3:%.*]] = mul nsw i32 [[EXT_A3]], [[EXT_B3]] +; CHECK-MAXBW-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_A2_LCSSA:%.*]] = phi i32 [ [[ADD_A2]], [[FOR_BODY]] ], [ [[TMP40]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_A3_LCSSA:%.*]] = phi i32 [ [[ADD_A3]], [[FOR_BODY]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[RESULT0:%.*]] = add nsw i32 [[ADD_A0_LCSSA]], [[ADD_A1_LCSSA]] +; CHECK-MAXBW-NEXT: [[RESULT1:%.*]] = add nsw i32 [[ADD_A2_LCSSA]], [[ADD_A3_LCSSA]] +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add nsw i32 [[RESULT0]], [[RESULT1]] +; CHECK-MAXBW-NEXT: ret i32 [[RESULT]] ; entry: br label %for.body @@ -1195,6 +1662,27 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; 
CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated( ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1246,6 +1734,27 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated( ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1279,6 +1788,31 @@ define i32 @not_dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-MAXBW: middle.block: +; 
CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE5]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -1344,6 +1878,30 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw 
i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_predicated_pragma( ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1386,6 +1944,30 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_predicated_pragma( ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1428,6 +2010,30 @@ define i32 @not_dotp_predicated_pragma(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 ; CHECK-MAXBW-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) +; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label 
[[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_A2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_A2]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nsw i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] ; entry: br label %for.body @@ -1457,12 +2063,12 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP8]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP11]], 4 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1483,18 +2089,49 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement [[TMP12]], i32 [[TMP19]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; 
CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP14]], 8 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1528,18 +2165,50 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]] +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() +; 
CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = extractelement [[TMP20]], i32 [[TMP29]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP26]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-MAXBW-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1560,6 +2229,37 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW: middle.block: +; 
CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP24]]) +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], 8 +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 [[TMP19]] +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[ADD_LCSSA]], [[EXT_B_LCSSA]] +; CHECK-MAXBW-NEXT: ret i32 [[RESULT]] ; entry: br label %for.body @@ -1576,7 +2276,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body @@ -1618,6 +2318,31 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP15]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[I_IV:%.*]] = phi i64 [ 
[[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[I_IV]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV:%.*]] = zext i8 [[TMP18]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[I_IV_NEXT]] = add nuw nsw i64 [[I_IV]], 1
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[I_IV_NEXT]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV3:%.*]] = zext i8 [[TMP19]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
+; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]]
+; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
+; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVE1-NEXT: ret i64 [[ADD_LCSSA]]
;
; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1665,6 +2390,32 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[TMP25]], [[TMP24]]
+; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[I_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[I_IV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVED-NEXT: [[CONV:%.*]] = zext i8 [[TMP28]] to i64
+; CHECK-INTERLEAVED-NEXT: [[I_IV_NEXT]] = add nuw nsw i64 [[I_IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[I_IV_NEXT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[CONV3:%.*]] = zext i8 [[TMP29]] to i64
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]]
+; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
+; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED: exit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: ret i64 [[ADD_LCSSA]]
;
; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1703,6 +2454,27 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[I_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[I_IV]]
+; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT: [[CONV:%.*]] = zext i8 [[TMP17]] to i64
+; CHECK-MAXBW-NEXT: [[I_IV_NEXT]] = add nuw nsw i64 [[I_IV]], 1
+; CHECK-MAXBW-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[I_IV_NEXT]]
+; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-MAXBW-NEXT: [[CONV3:%.*]] = zext i8 [[TMP18]] to i64
+; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]]
+; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]]
+; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16
+; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: ret i64 [[ADD_LCSSA]]
;
entry:
br label %for.body
@@ -1756,6 +2528,14 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to float
+; CHECK-INTERLEAVE1-NEXT: br label [[EXIT]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_FLOAT]], [[FOR_EXIT]] ]
+; CHECK-INTERLEAVE1-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4
+; CHECK-INTERLEAVE1-NEXT: ret void
;
; CHECK-INTERLEAVED-LABEL: define void @not_dotp_not_phi2(
; CHECK-INTERLEAVED-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
@@ -1809,6 +2589,41 @@
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]],
!llvm.loop [[LOOP21:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[MATRIX]], [[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[ADD_1:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[PTR]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[GEP_B1:%.*]] = getelementptr i8, ptr [[PTR]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[B_EXT:%.*]] = sext i8 [[LOAD_B]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[A_EXT]], [[B_EXT]] +; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B1:%.*]] = load i8, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[B_EXT1:%.*]] = sext i8 [[LOAD_B1]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[A_EXT1]], [[B_EXT1]] +; CHECK-INTERLEAVED-NEXT: [[ADD_1]] = add i32 [[MUL_1]], [[ADD]] +; CHECK-INTERLEAVED-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to float +; CHECK-INTERLEAVED-NEXT: br label [[EXIT]] +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_FLOAT]], [[FOR_EXIT]] ] +; CHECK-INTERLEAVED-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-INTERLEAVED-NEXT: ret void ; ; CHECK-MAXBW-LABEL: define void @not_dotp_not_phi2( ; CHECK-MAXBW-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -1839,6 +2654,14 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 { ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] ; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to float +; CHECK-MAXBW-NEXT: br label [[EXIT]] +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_FLOAT]], [[FOR_EXIT]] 
] +; CHECK-MAXBW-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-MAXBW-NEXT: ret void ; entry: %cmp = icmp sgt i32 %n, 0 @@ -1883,7 +2706,7 @@ exit: ; preds = %for.exit, %entry define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_outside_plan( -; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] @@ -1909,14 +2732,35 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_A]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVE1: exit.loopexit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[EXIT]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-INTERLEAVE1-NEXT: ret i64 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_outside_plan( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] 
@@ -1948,15 +2792,36 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]]
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVED: for.body:
+; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
+; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_A]], [[EXT_B]]
+; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-INTERLEAVED: exit.loopexit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[EXIT]]
+; CHECK-INTERLEAVED: exit:
+; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-INTERLEAVED-NEXT: ret i64 [[RESULT]]
;
; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan(
-; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
@@ -1988,11 +2853,32 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
+; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
+; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_A]], [[EXT_B]]
+; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-MAXBW-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-MAXBW: exit.loopexit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: br label [[EXIT]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-MAXBW-NEXT: ret i64 [[RESULT]]
;
entry:
%cmp = icmp eq i64 %n, 0
@@ -2021,7 +2907,7 @@ exit: ; preds = %for.cond.cleanup.loopexit, %ent
define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_outside_plan2(
-; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
@@ -2047,14 +2933,35 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [
[[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVE1: exit.loopexit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[EXIT]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-INTERLEAVE1-NEXT: ret i64 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_outside_plan2( -; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] @@ -2086,15 +2993,36 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw 
i64 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-INTERLEAVED: exit.loopexit:
+; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT: br label [[EXIT]]
+; CHECK-INTERLEAVED: exit:
+; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-INTERLEAVED-NEXT: ret i64 [[RESULT]]
;
; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_outside_plan2(
-; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], i16 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0
; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]]
@@ -2126,11 +3054,32 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PH]] ]
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[IV]]
+; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2
+; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64
+; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]]
+; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-MAXBW-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-MAXBW: exit.loopexit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT: br label [[EXIT]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-MAXBW-NEXT: ret i64 [[RESULT]]
;
entry:
%cmp = icmp eq i64 %n, 0
@@ -2162,3 +3111,84 @@ exit: ; preds = %for.cond.cleanup.loopexit, %ent
!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!10 = !{!"llvm.loop.vectorize.enable", i1 true}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
+;.
+; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVE1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-INTERLEAVE1: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]} +;. 
+; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-INTERLEAVED: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP26]] = distinct !{[[LOOP26]], [[META2]], [[META1]]} +;. 
+; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP14]] = distinct !{[[LOOP14]], [[META15:![0-9]+]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[META15]] = !{!"llvm.loop.mustprogress"} +; CHECK-MAXBW: [[LOOP16]] = distinct !{[[LOOP16]], [[META15]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP18]] = distinct !{[[LOOP18]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP20]] = distinct !{[[LOOP20]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP22]] = distinct !{[[LOOP22]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP24]] = distinct !{[[LOOP24]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll index f24b115ab9f99..3561f52df9490 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-no-dotprod.ll @@ -8,7 +8,7 @@ define i32 @not_dotp(ptr %a, ptr %b) { ; CHECK-LABEL: define i32 @not_dotp( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -35,7 +35,7 @@ define i32 @not_dotp(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; entry: @@ -53,7 +53,7 @@ for.body: ; preds = %for.body, %entry %mul = mul i32 %ext.b, %ext.a %add = add i32 %mul, %accum %iv.next = add i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, 0 + %exitcond.not = icmp eq i64 %iv.next, 1000 br i1 %exitcond.not, label %for.exit, label %for.body for.exit: ; preds = %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index ac054f569e11b..1d4b808a612a0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -148,18 +148,16 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-LABEL: define void @trunc_store( ; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; DEFAULT-NEXT: iter.check: -; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]] -; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; DEFAULT: vector.memcheck: +; DEFAULT-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1000 ; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 ; DEFAULT-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] -; DEFAULT-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[DST]] +; DEFAULT-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]] ; DEFAULT-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; DEFAULT-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; DEFAULT: vector.main.loop.iter.check: -; DEFAULT-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; DEFAULT: vector.ph: ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT3]], <16 x i16> poison, <16 x i32> zeroinitializer 
@@ -180,46 +178,36 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
; DEFAULT-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]]
; DEFAULT-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1, !alias.scope [[META8]], !noalias [[META5]]
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; DEFAULT-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; DEFAULT-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992
+; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; DEFAULT: middle.block:
-; DEFAULT-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; DEFAULT: vec.epilog.iter.check:
-; DEFAULT-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
-; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP16]]
-; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; DEFAULT: vec.epilog.ph:
-; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; DEFAULT-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
-; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP18]]
-; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
-; DEFAULT-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; DEFAULT-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[X]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT6]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP24:%.*]] = trunc <vscale x 2 x i16> [[BROADCAST_SPLAT7]] to <vscale x 2 x i8>
+; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i16> poison, i16 [[X]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT4]], <8 x i16> poison, <8 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP15:%.*]] = trunc <8 x i16> [[BROADCAST_SPLAT5]] to <8 x i8>
; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; DEFAULT: vec.epilog.vector.body:
; DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX5]], 0
-; DEFAULT-NEXT: [[TMP22:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]]
-; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP22]], i64 0
-; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; DEFAULT-NEXT: [[TMP23:%.*]] = trunc <vscale x 2 x i64> [[BROADCAST_SPLAT]] to <vscale x 2 x i8>
-; DEFAULT-NEXT: [[TMP25:%.*]] = and <vscale x 2 x i8> [[TMP23]], [[TMP24]]
+; DEFAULT-NEXT: [[TMP16:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP16]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT7]], <8 x i64> poison, <8 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP18:%.*]] = trunc <8 x i64> [[BROADCAST_SPLAT8]] to <8 x i8>
+; DEFAULT-NEXT: [[TMP14:%.*]] = and <8 x i8> [[TMP18]], [[TMP15]]
; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP21]]
; DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0
-; DEFAULT-NEXT: store <vscale x 2 x i8> [[TMP25]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]]
-; DEFAULT-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], [[TMP20]]
-; DEFAULT-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; DEFAULT-NEXT: store <8 x i8> [[TMP14]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]]
+; DEFAULT-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 8
+; DEFAULT-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 1000
+; DEFAULT-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; DEFAULT: vec.epilog.middle.block:
-; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
-; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; DEFAULT-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; DEFAULT: vec.epilog.scalar.ph:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ]
; DEFAULT-NEXT: br label [[LOOP:%.*]]
; DEFAULT: loop:
; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -230,7 +218,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
; DEFAULT-NEXT: store i8 [[TRUNC]], ptr [[GEP]], align 1
; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; DEFAULT: exit:
; DEFAULT-NEXT: ret void
@@ -238,36 +226,49 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
; PRED-LABEL: define void @trunc_store(
; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; PRED-NEXT: entry:
-; PRED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; PRED: vector.memcheck:
+; PRED-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1000
; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 8
; PRED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]]
-; PRED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[DST]]
+; PRED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP1]]
; PRED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; PRED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; PRED: vector.ph:
-; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT1]], <16 x i16> poison, <16 x i32> zeroinitializer
-; PRED-NEXT: [[TMP3:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT2]] to <16 x i8>
+; PRED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP10]], 2
+; PRED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 1000, [[TMP2]]
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1000)
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[X]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+; PRED-NEXT: [[TMP11:%.*]] = trunc <vscale x 2 x i16> [[BROADCAST_SPLAT]] to <vscale x 2 x i8>
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; PRED-NEXT: [[TMP1:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]]
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TMP1]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
-; PRED-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT]] to <16 x i8>
-; PRED-NEXT: [[TMP4:%.*]] = and <16 x i8> [[TMP2]], [[TMP3]]
+; PRED-NEXT: [[TMP7:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP7]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; PRED-NEXT: [[TMP8:%.*]] = trunc <vscale x 2 x i64> [[BROADCAST_SPLAT3]] to <vscale x 2 x i8>
+; PRED-NEXT: [[TMP9:%.*]] = and <vscale x 2 x i8> [[TMP8]], [[TMP11]]
; PRED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
-; PRED-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP6]], align 1, !alias.scope [[META7:![0-9]+]], !noalias [[META4]]
-; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; PRED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; PRED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; PRED-NEXT: call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP9]], ptr [[TMP6]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META7:![0-9]+]], !noalias [[META4]]
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1000)
+; PRED-NEXT: [[TMP12:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
+; PRED-NEXT: [[TMP13:%.*]] = extractelement <vscale x 2 x i1> [[TMP12]], i32 0
+; PRED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; PRED: scalar.ph:
-; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
; PRED-NEXT: br label [[LOOP:%.*]]
; PRED: loop:
; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -278,7 +279,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 {
; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
; PRED-NEXT: store i8 [[TRUNC]], ptr [[GEP]], align 1
; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
; PRED: exit:
; PRED-NEXT: ret void
@@ -295,7 +296,7 @@ loop:
%gep = getelementptr i8, ptr %dst, i64 %iv
store i8 %trunc, ptr %gep, align 1
%iv.next = add i64 %iv, 1
- %ec = icmp eq i64 %iv.next, 0
+ %ec = icmp eq i64 %iv.next, 1000
br i1 %ec, label %exit, label %loop
exit:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
index fb5d513dfbd75..8333c3193d799 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -972,7 +972,7 @@ loop:
%red.next = fadd double %for, %red
%for.next = sitofp i32 %iv to double
%iv.next = add nsw i32 %iv, 1
- %ec = icmp eq i32 %iv.next, 0
+ %ec = icmp eq i32 %iv.next, 1024
br i1 %ec, label %exit, label %loop, !llvm.loop !13
exit:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
index 7778f01c58dc3..91dd6e475ec47 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-extractvalue.ll
@@ -23,7 +23,7 @@ loop.body:
%add = add i64 %a, %b
store i64 %add, ptr %addr
%iv.next = add nsw i32 %iv, 1
- %cond = icmp ne i32 %iv.next, 0
+ %cond = icmp ne i32 %iv.next, 1000
br i1 %cond, label %loop.body, label %exit, !llvm.loop !0
exit:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 5dd9f8ff97cca..ccf8540b4ebf7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -10,7 +10,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT: Live-in ir<0> = original trip-count
+; CHECK-NEXT: Live-in ir<1024> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
; CHECK-NEXT: Successor(s): vector.ph
@@ -42,7 +42,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: middle.block:
; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
; CHECK-NEXT: EMIT vp<[[EXTRACT:%.+]]> = extract-from-end vp<[[RED_RESULT]]>, ir<1>
-; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<%1>
+; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<%1>
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
@@ -63,7 +63,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a
; CHECK-NEXT: IR %add = add i32 %mul, %accum
; CHECK-NEXT: IR %iv.next = add i64 %iv, 1
-; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 0
+; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<exit>:
@@ -86,7 +86,7 @@ for.body: ; preds = %for.body, %entry
%mul = mul i32 %ext.b, %ext.a
%add = add i32 %mul, %accum
%iv.next = add i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, 0
+ %exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %for.body
exit:
diff --git a/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll b/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll
index 4966ddd299492..6b201fdf21d21 100644
--- a/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll
@@ -8,17 +8,17 @@
target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
; Function Attrs: optsize
-define i32 @f() #0 {
+define i32 @f(ptr %src) #0 {
entry:
br label %loop
loop:
%g.016 = phi i32 [ 0, %entry ], [ %g.1.lcssa, %loop ]
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
- %0 = load i8, ptr undef, align 1
- %g.1.lcssa = add i32 %g.016, undef
+ %0 = load i8, ptr %src, align 1
+ %g.1.lcssa = add i32 %g.016, 1
%iv.next = add nsw i32 %iv, 1
- %exitcond = icmp eq i32 %iv.next, 0
+ %exitcond = icmp eq i32 %iv.next, 1000
br i1 %exitcond, label %exit, label %loop
exit:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll
index 8b47aee6bf389..0c5db437c177d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll
@@ -64,7 +64,7 @@ define i64 @second_lshr_operand_zero_via_scev() {
; CHECK-LABEL: define i64 @second_lshr_operand_zero_via_scev() {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[EXT_0:%.*]] = sext i8 0 to i32
-; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
@@ -90,14 +90,14 @@ define i64 @second_lshr_operand_zero_via_scev() {
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <2 x i32> [[STEP_ADD4]], splat (i32 2)
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i64> [[TMP11]], [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[BIN_RDX]])
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOPS:.*]]
; CHECK: [[LOOPS]]:
@@ -111,7 +111,7 @@ define i64 @second_lshr_operand_zero_via_scev() {
; CHECK-NEXT: [[RED_NEXT_V:%.*]] = select i1 [[C]], i64 [[AND]], i64 [[CONV_1]]
; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED_NEXT_V]], [[RED]]
;
CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOPS]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOPS]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] @@ -132,7 +132,7 @@ loops: %red.next.v = select i1 %c, i64 %and, i64 %conv.1 %red.next = or i64 %red.next.v, %red %iv.next = add i64 %iv, 1 - %ec = icmp eq i64 %iv.next, 0 + %ec = icmp eq i64 %iv.next, 1000 br i1 %ec, label %exit, label %loops exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 5c0aeb526e50c..bd28e28ddff95 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -1159,47 +1159,39 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 { ; CHECK-LABEL: @narrowed_reduction( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP:%.*]] to i32 -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = and <16 x i32> [[VEC_PHI]], splat (i32 1) +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i32> [[VEC_PHI1]], splat (i32 1) -; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP0]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP1]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP6]] = zext <16 x i1> [[TMP4]] to <16 x i32> ; CHECK-NEXT: [[TMP7]] = zext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP9:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i1> -; CHECK-NEXT: [[BIN_RDX:%.*]] = or <16 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[BIN_RDX]]) -; CHECK-NEXT: [[TMP12:%.*]] = zext i1 [[TMP11]] to i32 -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[TMP20:%.*]] = 
call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP10]]) +; CHECK-NEXT: [[TMP21:%.*]] = zext i1 [[TMP20]] to i32 +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 17, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OR13:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INC:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OR13:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[OR:%.*]], [[LOOP1]] ] ; CHECK-NEXT: [[AND:%.*]] = and i32 [[OR13]], 1 ; CHECK-NEXT: [[OR]] = or i32 [[AND]], [[CONV]] ; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 0 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 16 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[LOOP]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[LOOP1]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[OR_LCSSA]] ; entry: @@ -1212,7 +1204,7 @@ loop: %and = and i32 %or13, 1 %or = or i32 %and, %conv %inc = add i32 %iv, 1 - %ec = icmp eq i32 %iv, 0 + %ec = icmp eq i32 %iv, 16 br i1 %ec, label %exit, label %loop exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll index 0686395567cc2..68695a8b1282c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll @@ -352,9 +352,7 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 { ; CHECK-LABEL: define void @drop_zext_nneg( ; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[P1:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] -; CHECK: vector.scevcheck: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -372,12 +370,12 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 { ; CHECK-NEXT: store double [[TMP6]], ptr [[P1]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, 
[[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[BODY:%.*]] ; CHECK: body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[NEXT:%.*]], [[ELSE:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -394,7 +392,7 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 { ; CHECK-NEXT: [[PHI:%.*]] = phi double [ [[TMP9]], [[THEN]] ], [ 0.000000e+00, [[BODY]] ] ; CHECK-NEXT: store double [[PHI]], ptr [[P1]], align 8 ; CHECK-NEXT: [[NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[NEXT]], 0 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[NEXT]], 1024 ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT]], label [[BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -419,7 +417,7 @@ else: %phi = phi double [ %1, %then ], [ 0.000000e+00, %body ] store double %phi, ptr %p1, align 8 %next = add i64 %iv, 1 - %cmp = icmp eq i64 %next, 0 + %cmp = icmp eq i64 %next, 1024 br i1 %cmp, label %exit, label %body exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index 5c9375eb1d17f..d18d618c6a447 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -8,78 +8,7 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali ; CHECK-LABEL: define void @test_free_instructions_feeding_geps_for_interleave_groups( ; CHECK-SAME: ptr noalias [[P_INVAR:%.*]], ptr noalias [[DST_1:%.*]], ptr noalias [[DST_2:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] -; CHECK: [[VECTOR_SCEVCHECK]]: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST_1]], i64 8 -; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 0, [[MUL_RESULT]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult ptr [[TMP1]], [[SCEVGEP]] -; CHECK-NEXT: [[TMP3:%.*]] = or i1 [[TMP2]], [[MUL_OVERFLOW]] -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST_1]], i64 12 -; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = sub i64 0, [[MUL_RESULT3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult ptr [[TMP5]], [[SCEVGEP1]] -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[MUL_OVERFLOW4]] -; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[DST_1]], i64 4 -; CHECK-NEXT: [[MUL6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT7:%.*]] = extractvalue { i64, i1 } [[MUL6]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW8:%.*]] = extractvalue { i64, i1 } [[MUL6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = sub i64 0, [[MUL_RESULT7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[SCEVGEP5]], i64 [[MUL_RESULT7]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp ult ptr [[TMP9]], [[SCEVGEP5]] -; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP10]], 
[[MUL_OVERFLOW8]] -; CHECK-NEXT: [[MUL9:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT10:%.*]] = extractvalue { i64, i1 } [[MUL9]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW11:%.*]] = extractvalue { i64, i1 } [[MUL9]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = sub i64 0, [[MUL_RESULT10]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST_1]], i64 [[MUL_RESULT10]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp ult ptr [[TMP13]], [[DST_1]] -; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW11]] -; CHECK-NEXT: [[SCEVGEP12:%.*]] = getelementptr i8, ptr [[DST_2]], i64 8 -; CHECK-NEXT: [[MUL13:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT14:%.*]] = extractvalue { i64, i1 } [[MUL13]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW15:%.*]] = extractvalue { i64, i1 } [[MUL13]], 1 -; CHECK-NEXT: [[TMP16:%.*]] = sub i64 0, [[MUL_RESULT14]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[SCEVGEP12]], i64 [[MUL_RESULT14]] -; CHECK-NEXT: [[TMP18:%.*]] = icmp ult ptr [[TMP17]], [[SCEVGEP12]] -; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP18]], [[MUL_OVERFLOW15]] -; CHECK-NEXT: [[SCEVGEP16:%.*]] = getelementptr i8, ptr [[DST_2]], i64 12 -; CHECK-NEXT: [[MUL17:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT18:%.*]] = extractvalue { i64, i1 } [[MUL17]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW19:%.*]] = extractvalue { i64, i1 } [[MUL17]], 1 -; CHECK-NEXT: [[TMP20:%.*]] = sub i64 0, [[MUL_RESULT18]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[SCEVGEP16]], i64 [[MUL_RESULT18]] -; CHECK-NEXT: [[TMP22:%.*]] = icmp ult ptr [[TMP21]], [[SCEVGEP16]] -; CHECK-NEXT: [[TMP23:%.*]] = or i1 [[TMP22]], [[MUL_OVERFLOW19]] -; CHECK-NEXT: [[SCEVGEP20:%.*]] = getelementptr i8, ptr [[DST_2]], i64 4 -; CHECK-NEXT: [[MUL21:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT22:%.*]] = extractvalue { i64, i1 } [[MUL21]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW23:%.*]] = extractvalue { i64, i1 } [[MUL21]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = sub i64 0, [[MUL_RESULT22]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[SCEVGEP20]], i64 [[MUL_RESULT22]] -; CHECK-NEXT: [[TMP26:%.*]] = icmp ult ptr [[TMP25]], [[SCEVGEP20]] -; CHECK-NEXT: [[TMP27:%.*]] = or i1 [[TMP26]], [[MUL_OVERFLOW23]] -; CHECK-NEXT: [[MUL24:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 -1) -; CHECK-NEXT: [[MUL_RESULT25:%.*]] = extractvalue { i64, i1 } [[MUL24]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW26:%.*]] = extractvalue { i64, i1 } [[MUL24]], 1 -; CHECK-NEXT: [[TMP28:%.*]] = sub i64 0, [[MUL_RESULT25]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[DST_2]], i64 [[MUL_RESULT25]] -; CHECK-NEXT: [[TMP30:%.*]] = icmp ult ptr [[TMP29]], [[DST_2]] -; CHECK-NEXT: [[TMP31:%.*]] = or i1 [[TMP30]], [[MUL_OVERFLOW26]] -; CHECK-NEXT: [[TMP32:%.*]] = or i1 [[TMP3]], [[TMP7]] -; CHECK-NEXT: [[TMP33:%.*]] = or i1 [[TMP32]], [[TMP11]] -; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP15]] -; CHECK-NEXT: [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP19]] -; CHECK-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP23]] -; CHECK-NEXT: [[TMP37:%.*]] = or i1 [[TMP36]], [[TMP27]] -; CHECK-NEXT: [[TMP38:%.*]] = or i1 [[TMP37]], [[TMP31]] -; CHECK-NEXT: br i1 [[TMP38]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -106,12 +35,12 @@ 
define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali ; CHECK-NEXT: [[INTERLEAVED_VEC31:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC31]], ptr [[TMP49]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP53]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -139,7 +68,7 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali ; CHECK-NEXT: [[GEP_DST_276:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[ADD_3]] ; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP_DST_276]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void @@ -173,7 +102,7 @@ loop: %gep.dst.276 = getelementptr float, ptr %dst.2, i64 %add.3 store float 0.000000e+00, ptr %gep.dst.276, align 4 %iv.next = add i64 %iv, 1 - %ec = icmp eq i64 %iv.next, 0 + %ec = icmp eq i64 %iv.next, 1024 br i1 %ec, label %exit, label %loop exit: @@ -771,7 +700,7 @@ attributes #1 = { "min-legal-vector-width"="0" "target-cpu"="cascadelake" } ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} ; CHECK: [[META6]] = !{[[META7:![0-9]+]]} diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll index 95c74d19dd2db..b4c2a4ae79577 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll @@ -358,8 +358,8 @@ define float @PR35538_more_FMF(ptr nocapture readonly %a, i32 %N) #0 { ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP7:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]] -; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP6]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]] -; CHECK-NEXT: [[TMP9]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]] +; CHECK-NEXT: [[TMP8]] = select nnan ninf <4 x i1> [[TMP6]], <4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]] +; CHECK-NEXT: 
[[TMP9]] = select nnan ninf <4 x i1> [[TMP7]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll index cfae26a3a4257..4a371af87d67d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll @@ -11,7 +11,7 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[C:%.*]] = icmp ult i8 -68, -69 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[X]], 0 -; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer @@ -58,12 +58,12 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8 ; CHECK-NEXT: store i64 0, ptr [[TMP19]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: ; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ] @@ -78,7 +78,7 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_NEXT]] ; CHECK-NEXT: store i64 0, ptr [[GEP1]], align 8 ; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT1]], 0 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT1]], 1024 ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void @@ -103,7 +103,7 @@ loop.latch: %gep = getelementptr i64, ptr %dst, i64 %add store i64 0, ptr %gep, align 8 %iv.next = add i64 %iv, 1 - %ec = icmp eq i64 %iv.next, 0 + %ec = icmp eq i64 %iv.next, 1024 br i1 %ec, label %exit, label %loop.header exit: diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll index c6237170eebb1..54489af8c9f12 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll @@ -114,7 +114,7 @@ define float @fp_reduction_max(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; 
CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -142,7 +142,7 @@ define float @fp_reduction_max(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI5]], [[WIDE_LOAD6]] -; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP10]], <4 x float> [[VEC_PHI5]], <4 x float> [[WIDE_LOAD6]] +; CHECK-NEXT: [[TMP11]] = select fast <4 x i1> [[TMP10]], <4 x float> [[VEC_PHI5]], <4 x float> [[WIDE_LOAD6]] ; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-vector-trip-count-zero.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-vector-trip-count-zero.ll new file mode 100644 index 0000000000000..b90580638a4cd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-vector-trip-count-zero.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=2 -epilogue-vectorization-force-VF=2 -S %s | FileCheck %s + +target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" + +; Test case for https://github.com/llvm/llvm-project/issues/122558. 
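+; The IV starts at 0 and steps by -1, and the exit test is `%iv.next == 0`,
+; so the loop only terminates once the IV wraps all the way around: the
+; backedge-taken count is all ones, and the trip count (backedge-taken
+; count + 1) wraps to 0 in i32. The checks below confirm the loop is left
+; in scalar form even with -epilogue-vectorization-force-VF=2, rather than
+; emitting a vector loop whose vector trip count is 0.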
+define void @vector_trip_count_0_as_btc_is_all_1(ptr %dst) #0 { +; CHECK-LABEL: define void @vector_trip_count_0_as_btc_is_all_1( +; CHECK-SAME: ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], -1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[IV_NEXT]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.next = add i32 %iv, -1 + %gep = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %iv.next, ptr %gep, align 4 + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll index fc71f8a934047..01a68f01b8097 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll @@ -654,7 +654,7 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) { ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP2]], i32 3 ; CHECK-NEXT: store double [[TMP6]], ptr [[P:%.*]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI10:%.*]] = extractelement <4 x double> [[TMP4]], i32 2 @@ -664,7 +664,7 @@ define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) { ; CHECK-NEXT: phi double [ [[TMP0]], %middle.block ], [ 0.000000e+00, %Entry ] ; CHECK-NEXT: phi double [ [[TMP3]], %middle.block ], [ 0.000000e+00, %Entry ] ; CHECK-NEXT: phi double [ [[VECTOR_RECUR_EXTRACT9]], %middle.block ], [ 0.000000e+00, %Entry ] -; CHECK-NEXT: %bc.resume.val = phi i64 [ 0, %middle.block ], [ 0, %Entry ] +; CHECK-NEXT: %bc.resume.val = phi i64 [ 1000, %middle.block ], [ 0, %Entry ] ; CHECK: End: ; CHECK-NEXT: = phi double [ {{.+}}, %Loop ], [ [[TMP0]], %middle.block ] ; CHECK-NEXT: = phi double [ {{.+}}, %Loop ], [ [[TMP3]], %middle.block ] @@ -684,7 +684,7 @@ Loop: %iv.next= add nuw nsw i64 %iv, 1 %l2 = load double, ptr %b, align 8 store double %div, ptr %p, align 8 - %cond = icmp eq i64 %iv.next, 0 + %cond = icmp eq i64 %iv.next, 1000 br i1 %cond, label %End, label %Loop End: diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index 7db53d8ffcedf..2a85761da1e52 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -555,29 +555,31 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) { ; UNROLL-LABEL: @minimal_bit_widths_with_aliasing_store( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] -; UNROLL: for.body: -; UNROLL-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NEXT: [[TMP1:%.*]] = phi i64 [ 
[[TMP7:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; UNROLL: vector.body: +; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ] +; UNROLL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; UNROLL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP0]] +; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP1]] ; UNROLL-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 +; UNROLL-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP4]], align 1 ; UNROLL-NEXT: store i8 0, ptr [[TMP2]], align 1 +; UNROLL-NEXT: store i8 0, ptr [[TMP4]], align 1 ; UNROLL-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; UNROLL: if.then: -; UNROLL-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 -; UNROLL-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; UNROLL-NEXT: store i8 [[TMP5]], ptr [[TMP2]], align 1 +; UNROLL: pred.store.if: +; UNROLL-NEXT: store i8 [[TMP3]], ptr [[TMP2]], align 1 +; UNROLL-NEXT: store i8 [[TMP5]], ptr [[TMP4]], align 1 ; UNROLL-NEXT: br label [[FOR_INC]] -; UNROLL: for.inc: -; UNROLL-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 -; UNROLL-NEXT: [[TMP7]] = add i64 [[TMP1]], -1 -; UNROLL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0 -; UNROLL-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; UNROLL: pred.store.continue2: +; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; UNROLL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; UNROLL-NEXT: br i1 [[TMP6]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; UNROLL: for.end: ; UNROLL-NEXT: ret void ; ; UNROLL-NOSIMPLIFY-LABEL: @minimal_bit_widths_with_aliasing_store( ; UNROLL-NOSIMPLIFY-NEXT: entry: -; UNROLL-NOSIMPLIFY-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NOSIMPLIFY-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NOSIMPLIFY: vector.ph: ; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: vector.body: @@ -601,13 +603,13 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) { ; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE2]] ; UNROLL-NOSIMPLIFY: pred.store.continue2: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NOSIMPLIFY: scalar.ph: -; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 1000, [[ENTRY]] ] ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: for.body: ; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -631,24 +633,36 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) { ; ; VEC-LABEL: @minimal_bit_widths_with_aliasing_store( ; VEC-NEXT: entry: +; 
VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C1:%.*]], i64 0 +; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer ; VEC-NEXT: br label [[FOR_BODY:%.*]] -; VEC: for.body: -; VEC-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; VEC-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; VEC: vector.body: +; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] +; VEC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP0]] -; VEC-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 -; VEC-NEXT: store i8 0, ptr [[TMP2]], align 1 -; VEC-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; VEC: if.then: -; VEC-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 -; VEC-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8 -; VEC-NEXT: store i8 [[TMP5]], ptr [[TMP2]], align 1 +; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i32 0 +; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP3]], align 1 +; VEC-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP3]], align 1 +; VEC-NEXT: [[C:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 +; VEC-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; VEC: pred.store.if: +; VEC-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP0]] +; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 +; VEC-NEXT: store i8 [[TMP5]], ptr [[TMP4]], align 1 ; VEC-NEXT: br label [[FOR_INC]] -; VEC: for.inc: -; VEC-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 -; VEC-NEXT: [[TMP7]] = add i64 [[TMP1]], -1 -; VEC-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0 -; VEC-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VEC: pred.store.continue: +; VEC-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; VEC-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] +; VEC: pred.store.if1: +; VEC-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 1 +; VEC-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP7]] +; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 +; VEC-NEXT: store i8 [[TMP9]], ptr [[TMP8]], align 1 +; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] +; VEC: pred.store.continue2: +; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VEC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; VEC-NEXT: br i1 [[TMP10]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VEC: for.end: ; VEC-NEXT: ret void ; @@ -657,7 +671,7 @@ entry: for.body: %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] - %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ] + %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1000, %entry ] %tmp2 = getelementptr i8, ptr %ptr, i64 %tmp0 %tmp3 = load i8, ptr %tmp2, align 1 store i8 0, ptr %tmp2 diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 96311de673d8a..939709b91062e 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -3031,128 +3031,67 @@ exit: } ; This loop has a backedge taken count of i32_max. We need to check for this -; condition and branch directly to the scalar loop. +; condition and can skip vectorizing. 
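+; (A trip count of i32_max + 1 wraps to 0 in i32, so there is no usable
+; vector trip count; the updated checks below expect only the scalar loop,
+; instead of a dead vector loop guarded by 'br i1 true'.)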
- - -define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable { +define i32 @max_i32_backedgetaken() { ; CHECK-LABEL: @max_i32_backedgetaken( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0]] = and <2 x i32> [[VEC_PHI]], splat (i32 4) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]]) -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[B_0:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4 ; CHECK-NEXT: [[B_NEXT]] = add i32 [[B_0]], -1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ] ; CHECK-NEXT: ret i32 [[A_0_AND_LCSSA]] ; ; IND-LABEL: @max_i32_backedgetaken( ; IND-NEXT: entry: -; IND-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; IND: vector.ph: -; IND-NEXT: br label [[VECTOR_BODY:%.*]] -; IND: vector.body: -; IND-NEXT: br i1 poison, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] -; IND: middle.block: -; IND-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; IND: scalar.ph: ; IND-NEXT: br label [[LOOP:%.*]] ; IND: loop: -; IND-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; IND-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] ; IND-NEXT: [[B_NEXT]] = add i32 [[B_0]], -1 ; IND-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 -; IND-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; IND-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; IND: exit: ; IND-NEXT: ret i32 0 ; ; UNROLL-LABEL: @max_i32_backedgetaken( ; UNROLL-NEXT: entry: -; UNROLL-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; UNROLL: vector.ph: -; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] -; UNROLL: vector.body: -; UNROLL-NEXT: br i1 poison, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] -; UNROLL: middle.block: -; UNROLL-NEXT: br i1 true, label [[EXIT:%.*]], label 
[[SCALAR_PH]] -; UNROLL: scalar.ph: ; UNROLL-NEXT: br label [[LOOP:%.*]] ; UNROLL: loop: -; UNROLL-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; UNROLL-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] ; UNROLL-NEXT: [[B_NEXT]] = add i32 [[B_0]], -1 ; UNROLL-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 -; UNROLL-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; UNROLL-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; UNROLL: exit: ; UNROLL-NEXT: ret i32 0 ; ; UNROLL-NO-IC-LABEL: @max_i32_backedgetaken( ; UNROLL-NO-IC-NEXT: entry: -; UNROLL-NO-IC-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; UNROLL-NO-IC: vector.ph: -; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] -; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ splat (i32 -1), [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[TMP0]] = and <2 x i32> [[VEC_PHI]], splat (i32 4) -; UNROLL-NO-IC-NEXT: [[TMP1]] = and <2 x i32> [[VEC_PHI1]], splat (i32 4) -; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] -; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP1]], [[TMP0]] -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) -; UNROLL-NO-IC-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: -; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[B_0:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4 ; UNROLL-NO-IC-NEXT: [[B_NEXT]] = add i32 [[B_0]], -1 ; UNROLL-NO-IC-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 -; UNROLL-NO-IC-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; UNROLL-NO-IC: exit: -; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[A_0_AND_LCSSA]] ; ; INTERLEAVE-LABEL: @max_i32_backedgetaken( ; INTERLEAVE-NEXT: entry: -; INTERLEAVE-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; INTERLEAVE: vector.ph: -; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] -; INTERLEAVE: vector.body: -; INTERLEAVE-NEXT: br i1 poison, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] -; INTERLEAVE: 
middle.block: -; INTERLEAVE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] -; INTERLEAVE: scalar.ph: ; INTERLEAVE-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE: loop: -; INTERLEAVE-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; INTERLEAVE-NEXT: [[B_0:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] ; INTERLEAVE-NEXT: [[B_NEXT]] = add i32 [[B_0]], -1 ; INTERLEAVE-NEXT: [[EC:%.*]] = icmp eq i32 [[B_NEXT]], 0 -; INTERLEAVE-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] +; INTERLEAVE-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] ; INTERLEAVE: exit: ; INTERLEAVE-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll index 5bc832fbd6842..c3164762e8130 100644 --- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll +++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll @@ -317,7 +317,7 @@ define void @scalarize_ptrtoint(ptr %src, ptr %dst) { ; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: store ptr [[TMP11]], ptr %dst, align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP12]], label %middle.block, label %vector.body entry: @@ -332,7 +332,7 @@ loop: %cast.2 = inttoptr i64 %add to ptr store ptr %cast.2, ptr %dst, align 8 %iv.next = add i64 %iv, 1 - %ec = icmp eq i64 %iv.next, 0 + %ec = icmp eq i64 %iv.next, 1024 br i1 %ec, label %exit, label %loop exit: diff --git a/llvm/test/Transforms/LoopVectorize/is_fpclass.ll b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll index e5e7a1c748086..186470a1e8b78 100644 --- a/llvm/test/Transforms/LoopVectorize/is_fpclass.ll +++ b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll @@ -4,7 +4,7 @@ define void @d() { ; CHECK-LABEL: define void @d() { ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -19,12 +19,12 @@ define void @d() { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I7:%.*]], [[LOOP]] ] @@ -34,7 +34,7 @@ define void @d() { ; CHECK-NEXT: [[I6:%.*]] = select i1 [[I5]], float 0.000000e+00, float 0.000000e+00 ; CHECK-NEXT: store float [[I6]], ptr [[I4]], align 4 ; CHECK-NEXT: [[I7]] = add i64 [[I]], 1 -; CHECK-NEXT: 
[[I8:%.*]] = icmp eq i64 [[I7]], 0 +; CHECK-NEXT: [[I8:%.*]] = icmp eq i64 [[I7]], 128 ; CHECK-NEXT: br i1 [[I8]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -50,7 +50,7 @@ loop: %i6 = select i1 %i5, float 0.0, float 0.0 store float %i6, ptr %i4, align 4 %i7 = add i64 %i, 1 - %i8 = icmp eq i64 %i7, 0 + %i8 = icmp eq i64 %i7, 128 br i1 %i8, label %exit, label %loop exit: diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll index 1bfb34165e52e..065e38e9fa5cf 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -821,3 +821,122 @@ loop: exit: ret void } + +define void @multiple_ivs_wide(ptr %dst) { +; CHECK-LABEL: @multiple_ivs_wide( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 6 +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8) +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 64 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 64, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 64, [[VEC_EPILOG_ITER_CHECK]] ], [ -64, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX1]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[VEC_IND2]], splat (i32 2) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP21]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2 +; CHECK-NEXT: store i32 [[TMP26]], ptr [[TMP22]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3 +; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP23]], align 4 +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i32 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], splat (i32 8) +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[INDEX_NEXT4]], 64 +; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ 128, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 128, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ 64, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ -64, [[ITER_CHECK]] ], [ 64, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 2 +; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 2 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[IV_2_NEXT]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 128 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +; CHECK-PROFITABLE-BY-DEFAULT-LABEL: @multiple_ivs_wide( +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: entry: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[LOOP:%.*]] +; CHECK-PROFITABLE-BY-DEFAULT: loop: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 
[[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2:%.*]] = phi i32 [ -64, [[ENTRY]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_NEXT]] = add i32 [[IV]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[IV]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store i32 [[IV_2_NEXT]], ptr [[GEP]], align 4 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 128 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-PROFITABLE-BY-DEFAULT: exit: +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %iv.2 = phi i32 [ -64, %entry ], [ %iv.2.next, %loop ] + %iv.next = add i32 %iv, 2 + %iv.2.next = add i32 %iv.2, 2 + %gep = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %iv.2.next, ptr %gep, align 4 + %ec = icmp eq i32 %iv.next, 128 + br i1 %ec, label %exit, label %loop + +exit: ; preds = %loop + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll index 468b3ca337d7b..5f0a0c5d69a42 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll @@ -1528,19 +1528,17 @@ define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) { ; CHECK-NEXT: [[PRE_C:%.*]] = icmp ugt i32 [[ACOLS]], 0 ; CHECK-NEXT: br i1 [[PRE_C]], label [[EXIT:%.*]], label [[OUTER_HEADER_PREHEADER:%.*]] ; CHECK: outer.header.preheader: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 8 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[C]], i64 34359738368 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[C]], i64 8000 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] ; CHECK: outer.header: ; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[OUTER_HEADER_PREHEADER]] ] ; CHECK-NEXT: [[MUL_US:%.*]] = mul i32 [[OUTER_IV]], [[ACOLS]] ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr double, ptr [[A]], i32 [[MUL_US]] -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] -; CHECK: vector.scevcheck: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -1549,18 +1547,18 @@ define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[C]], i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8, !alias.scope [[META69:![0-9]+]], !noalias [[META72:![0-9]+]] +; CHECK-NEXT: 
[[TMP2:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8, !alias.scope [[META69:![0-9]+]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 -; CHECK-NEXT: store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8, !alias.scope [[META72]] +; CHECK-NEXT: store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8, !alias.scope [[META72:![0-9]+]], !noalias [[META69]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP74:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[OUTER_LATCH]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[OUTER_HEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_HEADER]] ] ; CHECK-NEXT: br label [[INNER:%.*]] ; CHECK: inner: ; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER]] ] @@ -1568,7 +1566,7 @@ define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) { ; CHECK-NEXT: [[L:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8 ; CHECK-NEXT: store double [[L]], ptr [[GEP_C]], align 8 ; CHECK-NEXT: [[INNER_IV_NEXT]] = add i32 [[INNER_IV]], 1 -; CHECK-NEXT: [[INNER_C:%.*]] = icmp eq i32 [[INNER_IV_NEXT]], 0 +; CHECK-NEXT: [[INNER_C:%.*]] = icmp eq i32 [[INNER_IV_NEXT]], 1000 ; CHECK-NEXT: br i1 [[INNER_C]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP75:![0-9]+]] ; CHECK: outer.latch: ; CHECK-NEXT: [[OUTER_IV_NEXT]] = add i32 [[OUTER_IV]], 1 @@ -1595,7 +1593,7 @@ inner: %l = load double, ptr %arrayidx.us, align 8 store double %l, ptr %gep.C, align 8 %inner.iv.next = add i32 %inner.iv, 1 - %inner.c = icmp eq i32 %inner.iv.next, 0 + %inner.c = icmp eq i32 %inner.iv.next, 1000 br i1 %inner.c, label %outer.latch, label %inner outer.latch: diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll index 2f2d715790229..20053cd8661d1 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll @@ -6,12 +6,12 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK-SAME: (i32 [[ARG:%.*]], ptr [[DST:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ADD:%.*]] = add i32 [[ARG]], 1 -; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[TMP0:%.*]] = sub i32 -1, [[ARG]] ; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[ADD]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[TMP0]], i32 [[ADD]] -; CHECK-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP2]], i32 -1) +; CHECK-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP2]], i32 1023) ; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 
; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = sub i32 0, [[MUL_RESULT]] @@ -19,8 +19,7 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], i1 [[TMP4]], i1 false ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[ADD]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer @@ -28,28 +27,28 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i32> [[TMP9]] to <4 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP17]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP12]], align 8 -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP14]], align 8 -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP16]], align 8 -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP18]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] +; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP16]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP11]], align 8 +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP13]], align 8 +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP15]], align 8 +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[LOOP]] ] @@ -59,7 +58,7 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr double, ptr [[DST]], i64 [[ZEXT]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[GETELEMENTPTR]], align 8 ; CHECK-NEXT: [[ADD2]] = add i64 [[PHI]], 1 -; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i64 [[ADD2]], 0 +; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i64 [[ADD2]], 1024 ; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -76,7 +75,7 @@ loop: %getelementptr = getelementptr double, ptr %dst, i64 %zext store double 0.000000e+00, ptr %getelementptr, align 8 %add2 = add i64 %phi, 1 - %icmp = icmp eq i64 %add2, 0 + %icmp = icmp eq i64 %add2, 1024 br i1 %icmp, label %exit, label %loop exit: @@ -89,36 +88,34 @@ define void @integer_induction_wraps_scev_predicate_known(i32 %x, ptr %call, ptr ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL:%.*]] = shl i32 [[X]], 1 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[MUL]] to i64 -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] -; CHECK: vector.scevcheck: -; CHECK-NEXT: br i1 true, label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 4294967264, [[TMP0]] -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 992, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i64> , [[DOTSPLAT]] -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i64> , [[DOTSPLAT]] +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> [[TMP4]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[INDEX]] to i32 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 30, [[DOTCAST]] -; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr ptr, ptr [[CALL]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr ptr, ptr [[TMP5]], i32 0 -; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = 
getelementptr ptr, ptr [[CALL]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr ptr, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967264 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -2, [[MIDDLE_BLOCK]] ], [ 30, [[VECTOR_SCEVCHECK]] ], [ 30, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_SCEVCHECK]] ], [ [[START]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1022, [[MIDDLE_BLOCK]] ], [ 30, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_COND]] ] @@ -127,7 +124,7 @@ define void @integer_induction_wraps_scev_predicate_known(i32 %x, ptr %call, ptr ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr ptr, ptr [[CALL]], i32 [[IV]] ; CHECK-NEXT: store ptr [[P_0]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV]], 0 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV]], 1024 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void @@ -143,7 +140,7 @@ for.cond: ; preds = %for.body, %entry %arrayidx = getelementptr ptr, ptr %call, i32 %iv store ptr %p.0, ptr %arrayidx, align 4 %inc = add i32 %iv, 1 - %tobool.not = icmp eq i32 %iv, 0 + %tobool.not = icmp eq i32 %iv, 1024 br i1 %tobool.not, label %for.end, label %for.cond for.end: ; preds = %for.cond diff --git a/llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll b/llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll new file mode 100644 index 0000000000000..3a0bb2ac1d9ee --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/select-with-fastflags.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 + +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s + +define void @select_with_fastmath_flags(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { +; CHECK-LABEL: define void @select_with_fastmath_flags( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, 
%[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[C]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+01) +; CHECK-NEXT: [[TMP7:%.*]] = select fast <4 x i1> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[GEP]], align 4 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds nuw float, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[GEP3]], align 4 +; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ogt float [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP11]], 1.000000e+01 +; CHECK-NEXT: [[COND:%.*]] = select fast i1 [[CMP4]], float [[ADD]], float [[TMP12]] +; CHECK-NEXT: [[GEP11:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: store float [[COND]], ptr [[GEP11]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] + %gep = getelementptr inbounds nuw float, ptr %b, i64 %iv + %0 = load float, ptr %gep, align 4 + %gep3 = getelementptr inbounds nuw float, ptr %c, i64 %iv + %1 = load float, ptr %gep3, align 4 + %cmp4 = fcmp fast ogt float %0, %1 + %add = fadd fast float %0, 1.000000e+01 + %cond = select fast i1 %cmp4, float %add, float %1 + %gep11 = getelementptr inbounds nuw float, ptr %a, i64 %iv + store float %cond, ptr %gep11, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: 
[[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 5c09ce22cc8fb..00d8de67a3b40 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -1200,6 +1200,62 @@ exit: ret i16 %for.1 } +define void @print_select_with_fastmath_flags(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { +; CHECK-LABEL: 'print_select_with_fastmath_flags' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' +; CHECK-NEXT: Live-in vp<[[VFUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%N> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT_EXIT:%.+]]> +; CHECK-NEXT: vp<[[ST:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds nuw ir<%b>, vp<[[ST]]> +; CHECK-NEXT: vp<[[PTR1:%.+]]> = vector-pointer ir<[[GEP1]]> +; CHECK-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> +; CHECK-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds nuw ir<%c>, vp<[[ST]]> +; CHECK-NEXT: vp<[[PTR2:%.+]]> = vector-pointer ir<[[GEP2]]> +; CHECK-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]> +; CHECK-NEXT: WIDEN ir<[[FCMP:%.+]]> = fcmp ogt ir<[[LD1]]>, ir<[[LD2]]> +; CHECK-NEXT: WIDEN ir<[[FADD:%.+]]> = fadd reassoc nnan ninf nsz arcp contract afn ir<[[LD1]]>, ir<1.000000e+01> +; CHECK-NEXT: WIDEN-SELECT ir<[[SELECT:%.+]]> = select reassoc nnan ninf nsz arcp contract afn ir<[[FCMP]]>, ir<[[FADD]]>, ir<[[LD2]]> +; CHECK-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds nuw ir<%a>, vp<[[ST]]> +; CHECK-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]> +; CHECK-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[SELECT]]> +; CHECK-NEXT: EMIT vp<[[IV_NEXT_EXIT]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } + +entry: + br label %for.body + +for.body: + %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] + %gep = getelementptr inbounds nuw float, ptr %b, i64 %iv + %0 = load float, ptr %gep, align 4 + %gep3 = getelementptr inbounds nuw float, ptr %c, i64 %iv + %1 = load float, ptr %gep3, align 4 + %cmp4 = fcmp fast ogt float %0, %1 + %add = fadd fast float %0, 1.000000e+01 + %cond = select fast i1 %cmp4, float %add, float %1 + %gep11 = getelementptr inbounds nuw float, ptr %a, i64 %iv + store float %cond, ptr %gep11, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %N + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll index b6391e0457697..b58a14952f7a8 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll @@ -10,13 +10,13 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) { ; CHECK: 
VPlan 'Initial VPlan for VF={1},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: ir<0> = original trip-count +; CHECK-NEXT: ir<1024> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<[[END:%.+]]> = DERIVED-IV ir<0> + vp<[[VEC_TC]]> * ir<-1> +; CHECK-NEXT: vp<[[END:%.+]]> = DERIVED-IV ir<1024> + vp<[[VEC_TC]]> * ir<-1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { @@ -49,18 +49,18 @@ define void @sink_with_sideeffects(i1 %c, ptr %ptr) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<0>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: ; CHECK-NEXT: EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[VEC_TC]]>, ir<0> -; CHECK-NEXT: EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir<0> +; CHECK-NEXT: EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END]]>, ir<1024> ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: IR %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME1]]> from scalar.ph) -; CHECK-NEXT: IR %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph) +; CHECK-NEXT: IR %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1024, %entry ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph) ; CHECK: IR %tmp5 = trunc i32 %tmp4 to i8 ; CHECK-NEXT: No successors ; CHECK-EMPTY: @@ -73,7 +73,7 @@ entry: for.body: %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] - %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 0, %entry ] + %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1024, %entry ] %tmp2 = getelementptr i8, ptr %ptr, i64 %tmp0 %tmp3 = load i8, ptr %tmp2, align 1 store i8 0, ptr %tmp2 diff --git a/llvm/test/tools/llvm-exegesis/X86/dry-run-measurement.test b/llvm/test/tools/llvm-exegesis/X86/dry-run-measurement.test deleted file mode 100644 index cf13fd04261ca..0000000000000 --- a/llvm/test/tools/llvm-exegesis/X86/dry-run-measurement.test +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-exegesis --mode=latency --opcode-name=LEA64r --use-dummy-perf-counters --benchmark-phase=dry-run-measurement | FileCheck %s -# REQUIRES: exegesis-can-execute-x86_64 - -# This test makes sure that llvm-exegesis doesn't execute any snippet in the presence of dry-run measurement. - -# Should not contain misleading results. -# CHECK: measurements: [] - -# Should not contain any error message. -# CHECK: error: '' diff --git a/llvm/test/tools/llvm-exegesis/lit.local.cfg b/llvm/test/tools/llvm-exegesis/lit.local.cfg index 343f34c58673e..a51a2d73442fa 100644 --- a/llvm/test/tools/llvm-exegesis/lit.local.cfg +++ b/llvm/test/tools/llvm-exegesis/lit.local.cfg @@ -30,12 +30,6 @@ def can_use_perf_counters(mode, extra_options=[]): print("could not exec llvm-exegesis") return False -# LLJIT builds its own TargetMachine using arch designated by LLVM_TARGET_ARCH, which -# is default to host. We don't want tests that use LLJIT (but not necessarily -# execute the snippets) to run on machines that are not even supported by -# exegesis. 
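The deletions here and just below revert llvm-exegesis's short-lived dry-run measurement mode: the lit test is removed, the LLJIT feature gate in lit.local.cfg goes away, and the following hunks drop the DryRunMeasure enumerator and switch the phase checks from >= back to ==. A minimal sketch of why equality is sufficient once Measure is the final phase again; the enumerators follow BenchmarkResult.h (earlier phases elided) and the helper is purely illustrative:

enum class BenchmarkPhaseSelectorE {
  PrepareAndAssembleSnippet,
  AssembleMeasuredCode,
  Measure, // terminal phase once DryRunMeasure is removed
};

// With DryRunMeasure gone, Measure is the only phase that executes the
// snippet and reads perf counters, so `==` matches exactly the cases that
// `>=` used to cover.
static bool needsPerfCounters(BenchmarkPhaseSelectorE Phase) {
  return Phase == BenchmarkPhaseSelectorE::Measure;
}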
-if config.root.native_target in ["AArch64", "Mips", "PowerPC", "RISCV", "X86"]: - config.available_features.add("native-registered-exegesis-target") for arch in ["aarch64", "mips", "powerpc", "x86_64"]: if can_execute_generated_snippets(arch): diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h index 5480d85616878..3c09a8380146e 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -38,7 +38,6 @@ enum class BenchmarkPhaseSelectorE { PrepareAndAssembleSnippet, AssembleMeasuredCode, Measure, - DryRunMeasure, }; enum class BenchmarkFilter { All, RegOnly, WithMem }; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index cc46f7feb6cf7..a7771b99e97b1 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -99,7 +99,7 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { static Expected> create(const LLVMState &State, object::OwningBinary Obj, BenchmarkRunner::ScratchSpace *Scratch, - std::optional BenchmarkProcessCPU, bool DryRun) { + std::optional BenchmarkProcessCPU) { Expected EF = ExecutableFunction::create(State.createTargetMachine(), std::move(Obj)); @@ -107,17 +107,14 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { return EF.takeError(); return std::unique_ptr( - new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch, - DryRun)); + new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch)); } private: InProcessFunctionExecutorImpl(const LLVMState &State, ExecutableFunction Function, - BenchmarkRunner::ScratchSpace *Scratch, - bool DryRun) - : State(State), Function(std::move(Function)), Scratch(Scratch), - DryRun(DryRun) {} + BenchmarkRunner::ScratchSpace *Scratch) + : State(State), Function(std::move(Function)), Scratch(Scratch) {} static void accumulateCounterValues(const SmallVector &NewValues, SmallVector *Result) { @@ -146,14 +143,9 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { CrashRecoveryContext CRC; CrashRecoveryContext::Enable(); const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() { - if (DryRun) { - Counter->start(); - Counter->stop(); - } else { - Counter->start(); - this->Function(ScratchPtr); - Counter->stop(); - } + Counter->start(); + this->Function(ScratchPtr); + Counter->stop(); }); CrashRecoveryContext::Disable(); PS.reset(); @@ -185,7 +177,6 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { const LLVMState &State; const ExecutableFunction Function; BenchmarkRunner::ScratchSpace *const Scratch; - bool DryRun = false; }; #ifdef __linux__ @@ -673,9 +664,6 @@ Expected> BenchmarkRunner::createFunctionExecutor( object::OwningBinary ObjectFile, const BenchmarkKey &Key, std::optional BenchmarkProcessCPU) const { - bool DryRun = - BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::DryRunMeasure; - switch (ExecutionMode) { case ExecutionModeE::InProcess: { if (BenchmarkProcessCPU.has_value()) @@ -683,8 +671,7 @@ BenchmarkRunner::createFunctionExecutor( "support benchmark core pinning."); auto InProcessExecutorOrErr = InProcessFunctionExecutorImpl::create( - State, std::move(ObjectFile), Scratch.get(), BenchmarkProcessCPU, - DryRun); + State, std::move(ObjectFile), Scratch.get(), BenchmarkProcessCPU); if (!InProcessExecutorOrErr) return 
InProcessExecutorOrErr.takeError(); @@ -692,10 +679,6 @@ BenchmarkRunner::createFunctionExecutor( } case ExecutionModeE::SubProcess: { #ifdef __linux__ - if (DryRun) - return make_error("The subprocess execution mode cannot " - "dry-run measurement at this moment."); - auto SubProcessExecutorOrErr = SubProcessFunctionExecutorImpl::create( State, std::move(ObjectFile), Key, BenchmarkProcessCPU); if (!SubProcessExecutorOrErr) diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp index 5636782bdf7f6..217b423d7b3f3 100644 --- a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp @@ -32,13 +32,11 @@ namespace { static std::vector loadIntReg(const MCSubtargetInfo &STI, unsigned Reg, const APInt &Value) { SmallVector MCInstSeq; - std::vector MatIntInstrs; MCRegister DestReg = Reg; RISCVMatInt::generateMCInstSeq(Value.getSExtValue(), STI, DestReg, MCInstSeq); - MatIntInstrs.resize(MCInstSeq.size()); - std::copy(MCInstSeq.begin(), MCInstSeq.end(), MatIntInstrs.begin()); + std::vector MatIntInstrs(MCInstSeq.begin(), MCInstSeq.end()); return MatIntInstrs; } @@ -124,6 +122,10 @@ class ExegesisRISCVTarget : public ExegesisTarget { ArrayRef getUnavailableRegisters() const override; + bool allowAsBackToBack(const Instruction &Instr) const override { + return !Instr.Description.isPseudo(); + } + Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue, const BitVector &ForbiddenRegs) const override; diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp index e2251ff978888..29e58692f0e92 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Target.cpp @@ -98,7 +98,7 @@ ExegesisTarget::createBenchmarkRunner( return nullptr; case Benchmark::Latency: case Benchmark::InverseThroughput: - if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure && + if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure && !PfmCounters.CycleCounter) { const char *ModeName = Mode == Benchmark::Latency ? 
"latency" @@ -116,7 +116,7 @@ ExegesisTarget::createBenchmarkRunner( State, Mode, BenchmarkPhaseSelector, ResultAggMode, ExecutionMode, ValidationCounters, BenchmarkRepeatCount); case Benchmark::Uops: - if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure && + if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure && !PfmCounters.UopsCounter && !PfmCounters.IssueCounters) return make_error( "can't run 'uops' mode, sched model does not define uops or issue " diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index 07bd44ee64f1f..fa37e05956be8 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -132,10 +132,7 @@ static cl::opt BenchmarkPhaseSelector( clEnumValN( BenchmarkPhaseSelectorE::Measure, "measure", "Same as prepare-measured-code, but also runs the measurement " - "(default)"), - clEnumValN( - BenchmarkPhaseSelectorE::DryRunMeasure, "dry-run-measurement", - "Same as measure, but does not actually execute the snippet")), + "(default)")), cl::init(BenchmarkPhaseSelectorE::Measure)); static cl::opt @@ -479,7 +476,7 @@ static void runBenchmarkConfigurations( } void benchmarkMain() { - if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure && + if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure && !UseDummyPerfCounters) { #ifndef HAVE_LIBPFM ExitWithError( @@ -504,7 +501,7 @@ void benchmarkMain() { // Preliminary check to ensure features needed for requested // benchmark mode are present on target CPU and/or OS. - if (BenchmarkPhaseSelector >= BenchmarkPhaseSelectorE::Measure) + if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure) ExitOnErr(State.getExegesisTarget().checkFeatureSupport()); if (ExecutionMode == BenchmarkRunner::ExecutionModeE::SubProcess && diff --git a/llvm/tools/spirv-tools/CMakeLists.txt b/llvm/tools/spirv-tools/CMakeLists.txt index 57dfe3310c459..a47c1f894d7a4 100644 --- a/llvm/tools/spirv-tools/CMakeLists.txt +++ b/llvm/tools/spirv-tools/CMakeLists.txt @@ -66,7 +66,7 @@ endif () if (SPIRV_AS) add_custom_target(spirv-as - COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_VAL}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-as") + COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${SPIRV_AS}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-as") else () add_custom_target(spirv-as COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${BINARY_DIR}/tools/spirv-as${CMAKE_EXECUTABLE_SUFFIX}" "${LLVM_RUNTIME_OUTPUT_INTDIR}/spirv-as${CMAKE_EXECUTABLE_SUFFIX}" diff --git a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp index 11a379c7e5024..ff6cf49bb9758 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp @@ -217,6 +217,51 @@ TEST(LinkGraphTest, ContentAccessAndUpdate) { [](char C) { return C == 0; })); } +TEST(LinkGraphTest, FindSymbolsByName) { + // Check that we can make defined and absolute symbols external. + LinkGraph G("foo", std::make_shared(), + Triple("x86_64-apple-darwin"), SubtargetFeatures(), + getGenericEdgeKindName); + auto &Sec = + G.createSection("__data", orc::MemProt::Read | orc::MemProt::Write); + + auto &B1 = + G.createContentBlock(Sec, BlockContent, orc::ExecutorAddr(0x1000), 8, 0); + + // Add an anonymous symbol to make sure that these don't disrupt by-name + // lookup of defined symbols. 
+ G.addAnonymousSymbol(B1, 0, 0, false, false); + + // Add named defined, external and absolute symbols. + auto Foo = G.intern("foo"); + auto &FooSym = G.addDefinedSymbol(B1, 0, Foo, 4, Linkage::Strong, + Scope::Default, false, false); + + auto Bar = G.intern("bar"); + auto &BarSym = G.addExternalSymbol(Bar, 0, false); + + auto Baz = G.intern("baz"); + auto &BazSym = G.addAbsoluteSymbol(Baz, orc::ExecutorAddr(0x1234), 0, + Linkage::Strong, Scope::Default, true); + + EXPECT_EQ(G.findDefinedSymbolByName(Foo), &FooSym); + EXPECT_EQ(G.findExternalSymbolByName(Foo), nullptr); + EXPECT_EQ(G.findAbsoluteSymbolByName(Foo), nullptr); + + EXPECT_EQ(G.findDefinedSymbolByName(Bar), nullptr); + EXPECT_EQ(G.findExternalSymbolByName(Bar), &BarSym); + EXPECT_EQ(G.findAbsoluteSymbolByName(Bar), nullptr); + + EXPECT_EQ(G.findDefinedSymbolByName(Baz), nullptr); + EXPECT_EQ(G.findExternalSymbolByName(Baz), nullptr); + EXPECT_EQ(G.findAbsoluteSymbolByName(Baz), &BazSym); + + auto Qux = G.intern("qux"); + EXPECT_EQ(G.findDefinedSymbolByName(Qux), nullptr); + EXPECT_EQ(G.findExternalSymbolByName(Qux), nullptr); + EXPECT_EQ(G.findAbsoluteSymbolByName(Qux), nullptr); +} + TEST(LinkGraphTest, MakeExternal) { // Check that we can make defined and absolute symbols external. LinkGraph G("foo", std::make_shared(), diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 3b571cce09a4f..a7b513bdfdc66 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -6243,8 +6243,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(), Builder.saveIP(), EntryInfo, DefaultAttrs, - RuntimeAttrs, Inputs, GenMapInfoCB, BodyGenCB, - SimpleArgAccessorCB)); + RuntimeAttrs, /*IfCond=*/nullptr, Inputs, + GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -6402,11 +6402,12 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0}; - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createTarget( - Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - EntryInfo, DefaultAttrs, RuntimeAttrs, CapturedArgs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, + EntryInfo, DefaultAttrs, RuntimeAttrs, + /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, + BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -6561,8 +6562,8 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(), Builder.saveIP(), EntryInfo, DefaultAttrs, - RuntimeAttrs, Inputs, GenMapInfoCB, BodyGenCB, - SimpleArgAccessorCB)); + RuntimeAttrs, /*IfCond=*/nullptr, Inputs, + GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -6660,11 +6661,12 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) { /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0}; - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createTarget( - Loc, 
/*IsOffloadEntry=*/true, EntryIP, EntryIP, - EntryInfo, DefaultAttrs, RuntimeAttrs, CapturedArgs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, + EntryInfo, DefaultAttrs, RuntimeAttrs, + /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, + BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -6774,11 +6776,12 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0}; - ASSERT_EXPECTED_INIT(OpenMPIRBuilder::InsertPointTy, AfterIP, - OMPBuilder.createTarget( - Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - EntryInfo, DefaultAttrs, RuntimeAttrs, CapturedArgs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); + ASSERT_EXPECTED_INIT( + OpenMPIRBuilder::InsertPointTy, AfterIP, + OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, + EntryInfo, DefaultAttrs, RuntimeAttrs, + /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, + BodyGenCB, SimpleArgAccessorCB)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp index 7fb7625f8c2d1..61b3637bb48e2 100644 --- a/llvm/unittests/TargetParser/TripleTest.cpp +++ b/llvm/unittests/TargetParser/TripleTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/TargetParser/Triple.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/VersionTuple.h" #include "gtest/gtest.h" @@ -1416,6 +1417,132 @@ TEST(TripleTest, Normalization) { EXPECT_EQ("x86_64-unknown-linux-gnu", Triple::normalize("x86_64-gnu-linux")); + EXPECT_EQ("a-unknown-unknown", + Triple::normalize("a", Triple::CanonicalForm::THREE_IDENT)); + EXPECT_EQ("a-b-unknown", + Triple::normalize("a-b", Triple::CanonicalForm::THREE_IDENT)); + EXPECT_EQ("a-b-c", + Triple::normalize("a-b-c", Triple::CanonicalForm::THREE_IDENT)); + EXPECT_EQ("a-b-c", + Triple::normalize("a-b-c-d", Triple::CanonicalForm::THREE_IDENT)); + EXPECT_EQ("a-b-c", + Triple::normalize("a-b-c-d-e", Triple::CanonicalForm::THREE_IDENT)); + + EXPECT_EQ("a-unknown-unknown-unknown", + Triple::normalize("a", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-b-unknown-unknown", + Triple::normalize("a-b", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-b-c-unknown", + Triple::normalize("a-b-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-b-c-d", + Triple::normalize("a-b-c-d", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-b-c-d", + Triple::normalize("a-b-c-d-e", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("a-unknown-unknown-unknown-unknown", + Triple::normalize("a", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-b-unknown-unknown-unknown", + Triple::normalize("a-b", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-b-c-unknown-unknown", + Triple::normalize("a-b-c", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-b-c-d-unknown", + Triple::normalize("a-b-c-d", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-b-c-d-e", + Triple::normalize("a-b-c-d-e", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("i386-b-c-unknown", + Triple::normalize("i386-b-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("i386-b-c-unknown-unknown", + Triple::normalize("i386-b-c", Triple::CanonicalForm::FIVE_IDENT)); + + 
EXPECT_EQ("i386-a-c-unknown", + Triple::normalize("a-i386-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("i386-a-c-unknown-unknown", + Triple::normalize("a-i386-c", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("i386-a-b-unknown", + Triple::normalize("a-b-i386", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("i386-a-b-c", + Triple::normalize("a-b-c-i386", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("a-pc-c-unknown", + Triple::normalize("a-pc-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("unknown-pc-b-c", + Triple::normalize("pc-b-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-pc-b-unknown", + Triple::normalize("a-b-pc", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-pc-b-c", + Triple::normalize("a-b-c-pc", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("a-b-linux-unknown", + Triple::normalize("a-b-linux", Triple::CanonicalForm::FOUR_IDENT)); + // We lose `-c` here as expected. + EXPECT_EQ("unknown-unknown-linux-b", + Triple::normalize("linux-b-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("a-unknown-linux-c", + Triple::normalize("a-linux-c", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("i386-pc-a-unknown", + Triple::normalize("a-pc-i386", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("i386-pc-unknown-unknown", + Triple::normalize("-pc-i386", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("unknown-pc-linux-c", + Triple::normalize("linux-pc-c", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("unknown-pc-linux-unknown", + Triple::normalize("linux-pc-", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("i386-unknown-unknown-unknown", + Triple::normalize("i386", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("unknown-pc-unknown-unknown", + Triple::normalize("pc", Triple::CanonicalForm::FOUR_IDENT)); + EXPECT_EQ("unknown-unknown-linux-unknown", + Triple::normalize("linux", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ( + "x86_64-unknown-linux-gnu", + Triple::normalize("x86_64-gnu-linux", Triple::CanonicalForm::FOUR_IDENT)); + + EXPECT_EQ("i386-a-b-unknown-unknown", + Triple::normalize("a-b-i386", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("i386-a-b-c-unknown", + Triple::normalize("a-b-c-i386", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("a-pc-c-unknown-unknown", + Triple::normalize("a-pc-c", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-pc-b-c-unknown", + Triple::normalize("pc-b-c", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-pc-b-unknown-unknown", + Triple::normalize("a-b-pc", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-pc-b-c-unknown", + Triple::normalize("a-b-c-pc", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("a-b-linux-unknown-unknown", + Triple::normalize("a-b-linux", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-unknown-linux-b-c", + Triple::normalize("linux-b-c", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("a-unknown-linux-c-unknown", + Triple::normalize("a-linux-c", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("i386-pc-a-unknown-unknown", + Triple::normalize("a-pc-i386", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("i386-pc-unknown-unknown-unknown", + Triple::normalize("-pc-i386", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-pc-linux-c-unknown", + Triple::normalize("linux-pc-c", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-pc-linux-unknown-unknown", + Triple::normalize("linux-pc-", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ("i386-unknown-unknown-unknown-unknown", + Triple::normalize("i386", 
Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-pc-unknown-unknown-unknown", + Triple::normalize("pc", Triple::CanonicalForm::FIVE_IDENT)); + EXPECT_EQ("unknown-unknown-linux-unknown-unknown", + Triple::normalize("linux", Triple::CanonicalForm::FIVE_IDENT)); + + EXPECT_EQ( + "x86_64-unknown-linux-gnu-unknown", + Triple::normalize("x86_64-gnu-linux", Triple::CanonicalForm::FIVE_IDENT)); + // Check that normalizing a permutated set of valid components returns a // triple with the unpermuted components. // @@ -2737,4 +2864,41 @@ TEST(TripleTest, DXILNormaizeWithVersion) { EXPECT_EQ("dxilv1.0-pc-shadermodel5.0-compute", Triple::normalize("dxil-shadermodel5.0-pc-compute")); } + +TEST(TripleTest, isCompatibleWith) { + struct { + const char *A; + const char *B; + bool Result; + } Cases[] = { + {"armv7-linux-gnueabihf", "thumbv7-linux-gnueabihf", true}, + {"armv4-none-unknown-eabi", "thumbv6-unknown-linux-gnueabihf", false}, + {"x86_64-apple-macosx10.9.0", "x86_64-apple-macosx10.10.0", true}, + {"x86_64-apple-macosx10.9.0", "i386-apple-macosx10.9.0", false}, + {"x86_64-apple-macosx10.9.0", "x86_64h-apple-macosx10.9.0", true}, + {"x86_64-unknown-linux-gnu", "x86_64-unknown-linux-gnu", true}, + {"x86_64-unknown-linux-gnu", "i386-unknown-linux-gnu", false}, + {"x86_64-unknown-linux-gnu", "x86_64h-unknown-linux-gnu", true}, + {"x86_64-pc-windows-gnu", "x86_64-pc-windows-msvc", false}, + {"x86_64-pc-windows-msvc", "x86_64-pc-windows-msvc-elf", false}, + {"i686-w64-windows-gnu", "i386-w64-windows-gnu", true}, + {"x86_64-w64-windows-gnu", "x86_64-pc-windows-gnu", true}, + {"armv7-w64-windows-gnu", "thumbv7-pc-windows-gnu", true}, + }; + + auto DoTest = [](const char *A, const char *B, + bool Result) -> testing::AssertionResult { + if (Triple(A).isCompatibleWith(Triple(B)) != Result) { + return testing::AssertionFailure() + << llvm::formatv("Triple {0} and {1} were expected to be {2}", A, + B, Result ? "compatible" : "incompatible"); + } + return testing::AssertionSuccess(); + }; + for (const auto &C : Cases) { + EXPECT_TRUE(DoTest(C.A, C.B, C.Result)); + // Test that the comparison is commutative. 
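Pulling together the two Triple additions exercised above, a small usage sketch grounded in the test expectations (the assertions restate cases from the tables above):

#include "llvm/TargetParser/Triple.h"
#include <cassert>

void tripleSketch() {
  using llvm::Triple;
  // CanonicalForm pads a short triple with "unknown" components after the
  // usual arch-vendor-os reordering has been applied.
  assert(Triple::normalize("a-b", Triple::CanonicalForm::FIVE_IDENT) ==
         "a-b-unknown-unknown-unknown");
  assert(Triple::normalize("x86_64-gnu-linux",
                           Triple::CanonicalForm::FOUR_IDENT) ==
         "x86_64-unknown-linux-gnu");
  // isCompatibleWith is commutative; arm/thumb variants of the same
  // environment are considered compatible.
  assert(Triple("armv7-linux-gnueabihf")
             .isCompatibleWith(Triple("thumbv7-linux-gnueabihf")));
}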
+ EXPECT_TRUE(DoTest(C.B, C.A, C.Result)); + } +} } // end anonymous namespace diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp index 149ba7a1d9032..bc300c3461100 100644 --- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp @@ -2645,8 +2645,8 @@ GICombinerEmitter::buildMatchTable(MutableArrayRef Rules) { for (RuleMatcher &Rule : Rules) { const StringRef Opcode = Rule.getOpcode(); assert(!Opcode.empty() && "Didn't expect an undefined opcode"); - if (OpcodeOrder.count(Opcode) == 0) - OpcodeOrder[Opcode] = CurrentOrdering++; + if (OpcodeOrder.try_emplace(Opcode, CurrentOrdering).second) + ++CurrentOrdering; } llvm::stable_sort(InputRules, [&OpcodeOrder](const Matcher *A, diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 3b334ea4ce152..04ebdbb0ffc90 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -2216,8 +2216,8 @@ GlobalISelEmitter::buildMatchTable(MutableArrayRef Rules, for (RuleMatcher &Rule : Rules) { const StringRef Opcode = Rule.getOpcode(); assert(!Opcode.empty() && "Didn't expect an undefined opcode"); - if (OpcodeOrder.count(Opcode) == 0) - OpcodeOrder[Opcode] = CurrentOrdering++; + if (OpcodeOrder.try_emplace(Opcode, CurrentOrdering).second) + ++CurrentOrdering; } llvm::stable_sort( diff --git a/mlir/include/mlir/IR/BuiltinTypeInterfaces.h b/mlir/include/mlir/IR/BuiltinTypeInterfaces.h index ed5e5ca22c595..e8011b5488dc9 100644 --- a/mlir/include/mlir/IR/BuiltinTypeInterfaces.h +++ b/mlir/include/mlir/IR/BuiltinTypeInterfaces.h @@ -11,6 +11,15 @@ #include "mlir/IR/Types.h" +namespace llvm { +struct fltSemantics; +} // namespace llvm + +namespace mlir { +class FloatType; +class MLIRContext; +} // namespace mlir + #include "mlir/IR/BuiltinTypeInterfaces.h.inc" #endif // MLIR_IR_BUILTINTYPEINTERFACES_H diff --git a/mlir/include/mlir/IR/BuiltinTypeInterfaces.td b/mlir/include/mlir/IR/BuiltinTypeInterfaces.td index c9dcd546cf67c..c36b738e38f42 100644 --- a/mlir/include/mlir/IR/BuiltinTypeInterfaces.td +++ b/mlir/include/mlir/IR/BuiltinTypeInterfaces.td @@ -16,6 +16,65 @@ include "mlir/IR/OpBase.td" +def FloatTypeInterface : TypeInterface<"FloatType"> { + let cppNamespace = "::mlir"; + let description = [{ + This type interface should be implemented by all floating-point types. It + defines the LLVM APFloat semantics and provides a few helper functions. + }]; + + let methods = [ + InterfaceMethod< + /*desc=*/[{ + Returns the APFloat semantics for this floating-point type. + }], + /*retTy=*/"const ::llvm::fltSemantics &", + /*methodName=*/"getFloatSemantics", + /*args=*/(ins) + >, + InterfaceMethod< + /*desc=*/[{ + Returns a float type with bitwidth scaled by `scale`. Returns a "null" + float type if the scaled element type cannot be represented. + }], + /*retTy=*/"::mlir::FloatType", + /*methodName=*/"scaleElementBitwidth", + /*args=*/(ins "unsigned":$scale), + /*methodBody=*/"", + /*defaultImplementation=*/"return ::mlir::FloatType();" + > + ]; + + let extraClassDeclaration = [{ + // Convenience factories. 
+ static FloatType getBF16(MLIRContext *ctx); + static FloatType getF16(MLIRContext *ctx); + static FloatType getF32(MLIRContext *ctx); + static FloatType getTF32(MLIRContext *ctx); + static FloatType getF64(MLIRContext *ctx); + static FloatType getF80(MLIRContext *ctx); + static FloatType getF128(MLIRContext *ctx); + static FloatType getFloat8E5M2(MLIRContext *ctx); + static FloatType getFloat8E4M3(MLIRContext *ctx); + static FloatType getFloat8E4M3FN(MLIRContext *ctx); + static FloatType getFloat8E5M2FNUZ(MLIRContext *ctx); + static FloatType getFloat8E4M3FNUZ(MLIRContext *ctx); + static FloatType getFloat8E4M3B11FNUZ(MLIRContext *ctx); + static FloatType getFloat8E3M4(MLIRContext *ctx); + static FloatType getFloat4E2M1FN(MLIRContext *ctx); + static FloatType getFloat6E2M3FN(MLIRContext *ctx); + static FloatType getFloat6E3M2FN(MLIRContext *ctx); + static FloatType getFloat8E8M0FNU(MLIRContext *ctx); + + /// Return the bitwidth of this float type. + unsigned getWidth(); + + /// Return the width of the mantissa of this type. + /// The width includes the integer bit. + unsigned getFPMantissaWidth(); + }]; +} + //===----------------------------------------------------------------------===// // MemRefElementTypeInterface //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/IR/BuiltinTypes.h b/mlir/include/mlir/IR/BuiltinTypes.h index 7f9c470ffec30..2b3c2b6d1753d 100644 --- a/mlir/include/mlir/IR/BuiltinTypes.h +++ b/mlir/include/mlir/IR/BuiltinTypes.h @@ -25,7 +25,6 @@ struct fltSemantics; namespace mlir { class AffineExpr; class AffineMap; -class FloatType; class IndexType; class IntegerType; class MemRefType; @@ -44,52 +43,6 @@ template class ValueSemantics : public TypeTrait::TraitBase {}; -//===----------------------------------------------------------------------===// -// FloatType -//===----------------------------------------------------------------------===// - -class FloatType : public Type { -public: - using Type::Type; - - // Convenience factories. - static FloatType getBF16(MLIRContext *ctx); - static FloatType getF16(MLIRContext *ctx); - static FloatType getF32(MLIRContext *ctx); - static FloatType getTF32(MLIRContext *ctx); - static FloatType getF64(MLIRContext *ctx); - static FloatType getF80(MLIRContext *ctx); - static FloatType getF128(MLIRContext *ctx); - static FloatType getFloat8E5M2(MLIRContext *ctx); - static FloatType getFloat8E4M3(MLIRContext *ctx); - static FloatType getFloat8E4M3FN(MLIRContext *ctx); - static FloatType getFloat8E5M2FNUZ(MLIRContext *ctx); - static FloatType getFloat8E4M3FNUZ(MLIRContext *ctx); - static FloatType getFloat8E4M3B11FNUZ(MLIRContext *ctx); - static FloatType getFloat8E3M4(MLIRContext *ctx); - static FloatType getFloat4E2M1FN(MLIRContext *ctx); - static FloatType getFloat6E2M3FN(MLIRContext *ctx); - static FloatType getFloat6E3M2FN(MLIRContext *ctx); - static FloatType getFloat8E8M0FNU(MLIRContext *ctx); - - /// Methods for support type inquiry through isa, cast, and dyn_cast. - static bool classof(Type type); - - /// Return the bitwidth of this float type. - unsigned getWidth(); - - /// Return the width of the mantissa of this type. - /// The width includes the integer bit. - unsigned getFPMantissaWidth(); - - /// Get or create a new FloatType with bitwidth scaled by `scale`. - /// Return null if the scaled element type cannot be represented. - FloatType scaleElementBitwidth(unsigned scale); - - /// Return the floating semantics of this float type. 
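Call sites are unaffected by the move: FloatType keeps the same surface, it is just implemented as a type interface now. A small usage sketch, assuming a valid MLIRContext:

#include "mlir/IR/BuiltinTypes.h"
#include <cassert>

unsigned mantissaOfF32(mlir::MLIRContext *ctx) {
  // The factories and width helpers now live on the interface and are
  // answered from the llvm::APFloat semantics via getFloatSemantics().
  mlir::FloatType f32 = mlir::FloatType::getF32(ctx);
  assert(f32.getWidth() == 32);
  return f32.getFPMantissaWidth(); // 24, counting the implicit integer bit
}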
- const llvm::fltSemantics &getFloatSemantics(); -}; - //===----------------------------------------------------------------------===// // TensorType //===----------------------------------------------------------------------===// @@ -448,15 +401,6 @@ inline bool BaseMemRefType::isValidElementType(Type type) { llvm::isa(type); } -inline bool FloatType::classof(Type type) { - return llvm::isa(type); -} - inline FloatType FloatType::getFloat4E2M1FN(MLIRContext *ctx) { return Float4E2M1FNType::get(ctx); } diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td index dca228097d782..fc50b28c09e41 100644 --- a/mlir/include/mlir/IR/BuiltinTypes.td +++ b/mlir/include/mlir/IR/BuiltinTypes.td @@ -79,8 +79,12 @@ def Builtin_Complex : Builtin_Type<"Complex", "complex"> { //===----------------------------------------------------------------------===// // Base class for Builtin dialect float types. -class Builtin_FloatType - : Builtin_Type { +class Builtin_FloatType declaredInterfaceMethods = []> + : Builtin_Type]> { let extraClassDeclaration = [{ static }] # name # [{Type get(MLIRContext *context); }]; @@ -322,14 +326,16 @@ def Builtin_Float8E8M0FNU : Builtin_FloatType<"Float8E8M0FNU", "f8E8M0FNU"> { //===----------------------------------------------------------------------===// // BFloat16Type -def Builtin_BFloat16 : Builtin_FloatType<"BFloat16", "bf16"> { +def Builtin_BFloat16 : Builtin_FloatType<"BFloat16", "bf16", + /*declaredInterfaceMethods=*/["scaleElementBitwidth"]> { let summary = "bfloat16 floating-point type"; } //===----------------------------------------------------------------------===// // Float16Type -def Builtin_Float16 : Builtin_FloatType<"Float16", "f16"> { +def Builtin_Float16 : Builtin_FloatType<"Float16", "f16", + /*declaredInterfaceMethods=*/["scaleElementBitwidth"]> { let summary = "16-bit floating-point type"; } @@ -343,7 +349,8 @@ def Builtin_FloatTF32 : Builtin_FloatType<"FloatTF32", "tf32"> { //===----------------------------------------------------------------------===// // Float32Type -def Builtin_Float32 : Builtin_FloatType<"Float32", "f32"> { +def Builtin_Float32 : Builtin_FloatType<"Float32", "f32", + /*declaredInterfaceMethods=*/["scaleElementBitwidth"]> { let summary = "32-bit floating-point type"; } diff --git a/mlir/include/mlir/Interfaces/ViewLikeInterface.h b/mlir/include/mlir/Interfaces/ViewLikeInterface.h index eb046bc742298..8f07e43f847ae 100644 --- a/mlir/include/mlir/Interfaces/ViewLikeInterface.h +++ b/mlir/include/mlir/Interfaces/ViewLikeInterface.h @@ -86,62 +86,100 @@ class OpWithOffsetSizesAndStridesConstantArgumentFolder final } }; -/// Printer hook for custom directive in assemblyFormat. +/// Printer hooks for custom directive in assemblyFormat. /// /// custom($values, $integers) /// custom($values, $integers, type($values)) /// -/// where `values` is of ODS type `Variadic<*>` and `integers` is of ODS -/// type `I64ArrayAttr`. Prints a list with either (1) the static integer value -/// in `integers` is `kDynamic` or (2) the next value otherwise. If `valueTypes` -/// is non-empty, it is expected to contain as many elements as `values` -/// indicating their types. This allows idiomatic printing of mixed value and -/// integer attributes in a list. E.g. -/// `[%arg0 : index, 7, 42, %arg42 : i32]`. -/// -/// Indices can be scalable. For example, "4" in "[2, [4], 8]" is scalable. -/// This notation is similar to how scalable dims are marked when defining -/// Vectors. 
For each value in `integers`, the corresponding `bool` in -/// `scalables` encodes whether it's a scalable index. If `scalableVals` is -/// empty then assume that all indices are non-scalable. +/// where `values` is of ODS type `Variadic<*>` and `integers` is of ODS type +/// `I64ArrayAttr`. Print a list where each element is either: +/// 1. the static integer value in `integers`, if it's not `kDynamic`, or +/// 2. the next value in `values`, otherwise. +/// +/// If `valueTypes` is provided, the corresponding type of each dynamic value is +/// printed. Otherwise, the type is not printed. Each type must match the type +/// of the corresponding value in `values`. `valueTypes` is redundant for +/// printing as we can retrieve the types from the actual `values`. However, +/// `valueTypes` is needed for parsing and we must keep the API symmetric for +/// parsing and printing. The type for integer elements is `i64` by default and +/// never printed. +/// +/// Integer indices can also be scalable in the context of scalable vectors, +/// denoted by square brackets (e.g., "[2, [4], 8]"). For each value in +/// `integers`, the corresponding `bool` in `scalableFlags` encodes whether it's +/// a scalable index. If `scalableFlags` is empty then assume that all indices +/// are non-scalable. +/// +/// Examples: +/// +/// * Input: `integers = [kDynamic, 7, 42, kDynamic]`, +/// `values = [%arg0, %arg42]` and +/// `valueTypes = [index, i32]` +/// prints: +/// `[%arg0 : index, 7, 42, %arg42 : i32]` +/// +/// * Input: `integers = [kDynamic, 7, 42, kDynamic]`, +/// `values = [%arg0, %arg42]` and +/// `valueTypes = []` +/// prints: +/// `[%arg0, 7, 42, %arg42]` +/// +/// * Input: `integers = [2, 4, 8]`, +/// `values = []` and +/// `scalableFlags = [false, true, false]` +/// prints: +/// `[2, [4], 8]` +/// void printDynamicIndexList( OpAsmPrinter &printer, Operation *op, OperandRange values, - ArrayRef integers, ArrayRef scalables, + ArrayRef integers, ArrayRef scalableFlags, TypeRange valueTypes = TypeRange(), AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square); inline void printDynamicIndexList( OpAsmPrinter &printer, Operation *op, OperandRange values, ArrayRef integers, TypeRange valueTypes = TypeRange(), AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) { - return printDynamicIndexList(printer, op, values, integers, {}, valueTypes, - delimiter); + return printDynamicIndexList(printer, op, values, integers, + /*scalableFlags=*/{}, valueTypes, delimiter); } -/// Parser hook for custom directive in assemblyFormat. +/// Parser hooks for custom directive in assemblyFormat. /// /// custom($values, $integers) /// custom($values, $integers, type($values)) /// /// where `values` is of ODS type `Variadic<*>` and `integers` is of ODS -/// type `I64ArrayAttr`. Parse a mixed list with either (1) static integer -/// values or (2) SSA values. Fill `integers` with the integer ArrayAttr, where -/// `kDynamic` encodes the position of SSA values. Add the parsed SSA values -/// to `values` in-order. If `valueTypes` is non-null, fill it with types -/// corresponding to values; otherwise the caller must handle the types. -/// -/// E.g. after parsing "[%arg0 : index, 7, 42, %arg42 : i32]": -/// 1. `result` is filled with the i64 ArrayAttr "[`kDynamic`, 7, 42, -/// `kDynamic`]" -/// 2. `ssa` is filled with "[%arg0, %arg1]". -/// -/// Indices can be scalable. For example, "4" in "[2, [4], 8]" is scalable. -/// This notation is similar to how scalable dims are marked when defining
For each value in `integers`, the corresponding `bool` in -/// `scalableVals` encodes whether it's a scalable index. +/// type `I64ArrayAttr`. Parse a mixed list where each element is either a +/// static integer or an SSA value. Fill `integers` with the integer ArrayAttr, +/// where `kDynamic` encodes the position of SSA values. Add the parsed SSA +/// values to `values` in-order. +/// +/// If `valueTypes` is provided, fill it with the types corresponding to each +/// value in `values`. Otherwise, the caller must handle the types and parsing +/// will fail if the type of the value is found (e.g., `[%arg0 : index, 3, %arg1 +/// : index]`). +/// +/// Integer indices can also be scalable in the context of scalable vectors, +/// denoted by square brackets (e.g., "[2, [4], 8]"). For each value in +/// `integers`, the corresponding `bool` in `scalableFlags` encodes whether it's +/// a scalable index. +/// +/// Examples: +/// +/// * After parsing "[%arg0 : index, 7, 42, %arg42 : i32]": +/// 1. `result` is filled with `[kDynamic, 7, 42, kDynamic]` +/// 2. `values` is filled with "[%arg0, %arg1]". +/// 3. `scalableFlags` is filled with `[false, true, false]`. +/// +/// * After parsing `[2, [4], 8]`: +/// 1. `result` is filled with `[2, 4, 8]` +/// 2. `values` is empty. +/// 3. `scalableFlags` is filled with `[false, true, false]`. +/// ParseResult parseDynamicIndexList( OpAsmParser &parser, SmallVectorImpl &values, - DenseI64ArrayAttr &integers, DenseBoolArrayAttr &scalableVals, + DenseI64ArrayAttr &integers, DenseBoolArrayAttr &scalableFlags, SmallVectorImpl *valueTypes = nullptr, AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square); inline ParseResult parseDynamicIndexList( @@ -149,8 +187,8 @@ inline ParseResult parseDynamicIndexList( SmallVectorImpl &values, DenseI64ArrayAttr &integers, SmallVectorImpl *valueTypes = nullptr, AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) { - DenseBoolArrayAttr scalableVals = {}; - return parseDynamicIndexList(parser, values, integers, scalableVals, + DenseBoolArrayAttr scalableFlags; + return parseDynamicIndexList(parser, values, integers, scalableFlags, valueTypes, delimiter); } diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 67dd21aafe4fe..8f5b49e0c2130 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -3889,7 +3889,7 @@ DiagnosedSilenceableFailure transform::WinogradConv2DOp::applyToOne( << "this operation is not supported to convert to Winograd Conv2D"; } - if (supported && failed(maybeTransformed)) { + if (failed(maybeTransformed)) { return emitSilenceableError() << "apply Winograd Conv2D failed"; } @@ -3927,7 +3927,7 @@ DiagnosedSilenceableFailure transform::DecomposeWinogradOp::applyToOne( return diag; } - if (supported && failed(maybeTransformed)) { + if (failed(maybeTransformed)) { DiagnosedSilenceableFailure diag = emitSilenceableError() << "decompose Winograd operations failed"; diag.attachNote(target->getLoc()) << "target op"; diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp index d04f302200519..95064083b21d4 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @@ -1090,15 +1090,20 @@ static LogicalResult alignedConversionPrecondition(PatternRewriter &rewriter, 
unsigned srcElemBitwidth = subByteVecType.getElementTypeBitWidth(); unsigned dstElemBitwidth = dstType.getElementTypeBitWidth(); - // Only {s}i4 -> (size_of({{s}i/f}) >= 8) are supported for now. - if (srcElemBitwidth != 4 || dstElemBitwidth < 8 || - (dstElemBitwidth % srcElemBitwidth) != 0) - return rewriter.notifyMatchFailure(op, "Not a supported aligned case"); + if (dstElemBitwidth < 8) + return rewriter.notifyMatchFailure( + op, "the bitwidth of dstType must be greater than or equal to 8"); + if (dstElemBitwidth % srcElemBitwidth != 0) + return rewriter.notifyMatchFailure(op, "unaligned cases are not supported"); + if (srcElemBitwidth != 2 && srcElemBitwidth != 4) + return rewriter.notifyMatchFailure( + op, "only src bitwidth of 2 or 4 is supported at this moment"); - const int numSrcElemsPerDestElem = dstElemBitwidth / srcElemBitwidth; - if ((subByteVecType.getShape().back() % numSrcElemsPerDestElem) != 0) + const int numSrcElemsPerByte = 8 / srcElemBitwidth; + if ((subByteVecType.getShape().back() % numSrcElemsPerByte) != 0) return rewriter.notifyMatchFailure( - op, "Not an even number of i4 elements in trailing dim"); + op, "the trailing dimension of the input vector of sub-bytes must be a " + "multiple of 8 / <sub-byte-width>"); return success(); } @@ -1179,70 +1184,166 @@ Value BitCastRewriter::genericRewriteStep( return runningResult; } -/// Rewrite the i4 -> i8 signed extension into a sequence of shuffles and -/// bitwise ops that take advantage of high-level information to avoid leaving -/// LLVM to scramble with peephole optimizations. -static Value rewriteI4ToI8SignedExt(PatternRewriter &rewriter, Location loc, - Value srcValue) { - VectorType srcVecType = cast(srcValue.getType()); - assert(srcVecType.getElementType().isSignlessInteger(4) && - "Expected i4 type"); +/// Bitcasts the aligned `subByteVec` vector to a vector of i8, where +/// "aligned" means that it satisfies alignedConversionPrecondition. +/// +/// Example: +/// vector<16x16xi2> -> vector<16x4xi8> +/// vector<16x16xi4> -> vector<16x8xi8> +static Value bitcastSubByteVectorToI8(PatternRewriter &rewriter, Location loc, + Value subByteVec) { + auto srcVecType = cast(subByteVec.getType()); + int64_t srcBitwidth = srcVecType.getElementType().getIntOrFloatBitWidth(); + assert(8 % srcBitwidth == 0 && + "Unsupported sub-byte type (not a divisor of i8)"); + int64_t numSrcElemsPerByte = 8 / srcBitwidth; + SmallVector vecShape(srcVecType.getShape()); + // Adjust last dimension of the vector, so the total size remains the same. + vecShape.back() = vecShape.back() / numSrcElemsPerByte; + auto i8VecType = VectorType::get(vecShape, rewriter.getI8Type()); + return rewriter.create(loc, i8VecType, subByteVec); +} - // 1. Generate a bitcast vector -> vector. - SmallVector i8VecShape = llvm::to_vector(srcVecType.getShape()); - constexpr int64_t i4Toi8BitwidthFactor = 2; - i8VecShape.back() = i8VecShape.back() / i4Toi8BitwidthFactor; - auto i8VecType = VectorType::get(i8VecShape, rewriter.getI8Type()); - Value i8Vector = rewriter.create(loc, i8VecType, srcValue); +/// Extracts a signed N-bit sequence from each element of a vector of bytes, +/// starting at the specified bit index. +/// The `bitIdx` starts at 0 from the LSB and moves to the left. +/// +/// Example for a single element: +/// Extract numBits=2 starting at bitIdx=2 +/// src = [0 | 1 | 0 | 1 | 1 | 1 | 1 | 0] +/// indices = [7 | 6 | 5 | 4 | 3 | 2 | 1 | 0] +/// target = [. . . . ^ ^ . .] +/// +/// The target sequence is [11](decimal=-1) as signed 2-bit integer.
+/// So the result should be [11 11 11 11](decimal=-1) as signed 8-bit integer. +/// +/// src = [01 01 11 10] +/// shl = arith.shl(src, 4) -> [11 10 00 00] +/// result = arith.shrsi(shl, 6) -> [11 11 11 11] +static Value extractNBitsPerByteAndSignExtendToI8(PatternRewriter &rewriter, + Location loc, Value src, + int bitIdx, int numBits) { + auto srcType = cast(src.getType()); + Value shl = src; + int8_t bitsToShiftLeft = 8 - numBits - bitIdx; + assert(bitIdx >= 0 && bitsToShiftLeft >= 0 && numBits > 0 && numBits <= 8 && + "Invalid bitIdx range"); + if (bitsToShiftLeft != 0) { + Value shiftLeftValues = rewriter.create( + loc, DenseElementsAttr::get(srcType, bitsToShiftLeft)); + shl = rewriter.create(loc, src, shiftLeftValues); + } - // 2. Extend i4 elements to i8 elements using shifts. Low i4 elemens of each - // byte are place in one vector and the high i4 elements in another vector. - constexpr int8_t bitsToShift = 4; - auto shiftValues = rewriter.create( - loc, DenseElementsAttr::get(i8VecType, bitsToShift)); - Value shl = rewriter.create(loc, i8Vector, shiftValues); - Value low = rewriter.create(loc, shl, shiftValues); - Value high = rewriter.create(loc, i8Vector, shiftValues); + int8_t bitsToShiftRight = 8 - numBits; + Value shiftRightValues = rewriter.create( + loc, DenseElementsAttr::get(srcType, bitsToShiftRight)); + Value shr = rewriter.create(loc, shl, shiftRightValues); + return shr; +} - // 3. Interleave low and high i8 elements. - return rewriter.create(loc, low, high); +/// Extracts an unsigned N-bit sequence from each element of a vector of bytes, +/// starting at the specified bit index. +/// The `bitIdx` starts at 0 from the LSB and moves to the left. +/// +/// Example for a single element: +/// Extract numBits=2 starting at bitIdx=2 +/// src = [0 | 1 | 0 | 1 | 1 | 0 | 1 | 0] +/// indices = [7 | 6 | 5 | 4 | 3 | 2 | 1 | 0] +/// target = [. . . . ^ ^ . .] +/// +/// The target sequence is [10](decimal=2) as unsigned 2-bit integer. +/// So the result should be [00 00 00 10](decimal=2) as unsigned 8-bit integer. +/// +/// src = [01 01 10 10] +/// mask = [00 00 00 11] +/// shr = arith.shrui(src, 2) = [00 01 01 10] +/// result = arith.andi(shr, mask) = [00 00 00 10] +/// NOTE: Similarly to extractNBitsPerByteAndSignExtendToI8, this could be +/// achieved by using arith::ShLIOp + arith::ShRUIOp instead of the masking. +/// However, by using arith::ShRUIOp + arith::AndIOp, we are eliminating shift +/// left when the index is 0. +static Value extractNBitsPerByteAndExtendToI8(PatternRewriter &rewriter, + Location loc, Value src, + int bitIdx, int numBits) { + assert(bitIdx >= 0 && bitIdx <= 8 - numBits && numBits > 0 && numBits <= 8 && + "Invalid bitIdx range"); + auto srcType = cast(src.getType()); + int8_t bitsToShiftRight = bitIdx; + Value shr = src; + if (bitsToShiftRight != 0) { + Value shiftRightValues = rewriter.create( + loc, DenseElementsAttr::get(srcType, bitsToShiftRight)); + shr = rewriter.create(loc, src, shiftRightValues); + } + if (bitIdx + numBits == 8) { + return shr; + } + uint8_t lowBitsMask = (1 << numBits) - 1; + Value lowBitsMaskValues = rewriter.create( + loc, DenseElementsAttr::get(srcType, lowBitsMask)); + return rewriter.create(loc, shr, lowBitsMaskValues); } -/// Rewrite the i4 -> i8 unsigned extension into a sequence of shuffles and -/// bitwise ops that take advantage of high-level information to avoid leaving -/// LLVM to scramble with peephole optimizations. 
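The two helpers above boil down to classic byte-level bit tricks. Below is a standalone scalar model of both recipes, using the same example values as the doc comments; vector semantics aside, this is the arithmetic the emitted arith ops perform (it assumes arithmetic right shift of negative int8_t, which mainstream compilers provide):

#include <cassert>
#include <cstdint>

// Signed field: shift the field up to the sign bit, then arithmetic-shift
// back down so the top bit is replicated.
int8_t extractSigned(uint8_t src, int bitIdx, int numBits) {
  int8_t shl = static_cast<int8_t>(src << (8 - numBits - bitIdx));
  return static_cast<int8_t>(shl >> (8 - numBits));
}

// Unsigned field: shift the field down, then mask away the high bits.
uint8_t extractUnsigned(uint8_t src, int bitIdx, int numBits) {
  return (src >> bitIdx) & ((1u << numBits) - 1);
}

int main() {
  // Bits [3:2] of 0b01011110 are 0b11, i.e. -1 as a signed 2-bit value.
  assert(extractSigned(0b01011110, /*bitIdx=*/2, /*numBits=*/2) == -1);
  // Bits [3:2] of 0b01011010 are 0b10, i.e. 2 as an unsigned 2-bit value.
  assert(extractUnsigned(0b01011010, /*bitIdx=*/2, /*numBits=*/2) == 2);
  return 0;
}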
-/// Rewrite the i4 -> i8 unsigned extension into a sequence of shuffles and
-/// bitwise ops that take advantage of high-level information to avoid leaving
-/// LLVM to scramble with peephole optimizations.
-static Value rewriteI4ToI8UnsignedExt(PatternRewriter &rewriter, Location loc,
-                                      Value srcValue) {
-  VectorType srcVecType = cast<VectorType>(srcValue.getType());
+using ExtractNBitsFn =
+    std::function<Value(PatternRewriter &, Location, Value, int, int)>;
+
+/// Rewrite the i4 -> i8 extension into a sequence of shuffles and
+/// bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.
+static Value rewriteI4ToI8Ext(PatternRewriter &rewriter, Location loc,
+                              Value srcValue, const ExtractNBitsFn &extFn) {
+  [[maybe_unused]] auto srcVecType = cast<VectorType>(srcValue.getType());
   assert(srcVecType.getElementType().isSignlessInteger(4) &&
          "Expected i4 type");
 
   // 1. Generate a bitcast vector<Xxi4> -> vector<X/2xi8>.
-  SmallVector<int64_t> i8VecShape = llvm::to_vector(srcVecType.getShape());
-  constexpr int64_t i4Toi8BitwidthFactor = 2;
-  i8VecShape.back() = i8VecShape.back() / i4Toi8BitwidthFactor;
-  auto i8VecType = VectorType::get(i8VecShape, rewriter.getI8Type());
-  Value i8Vector = rewriter.create<vector::BitCastOp>(loc, i8VecType, srcValue);
-
-  // 2 Extend the i4 elements using shifts & masking. Low i4 elements of each
-  //   byte are placed in one vector and the high i4 elements in another vector.
-  constexpr uint8_t lowBitsMask = 15; // Equivalent to [00001111] bit mask
-  auto lowBitsMaskValues = rewriter.create<arith::ConstantOp>(
-      loc, DenseElementsAttr::get(i8VecType, lowBitsMask));
-  Value low = rewriter.create<arith::AndIOp>(loc, i8VecType, i8Vector,
-                                             lowBitsMaskValues);
-  constexpr int8_t highBitsToShift = 4;
-  auto highShiftValues = rewriter.create<arith::ConstantOp>(
-      loc, DenseElementsAttr::get(i8VecType, highBitsToShift));
-  Value high = rewriter.create<arith::ShRUIOp>(loc, i8Vector, highShiftValues);
+  Value i8Vector = bitcastSubByteVectorToI8(rewriter, loc, srcValue);
+
+  // 2. Extend i4 elements to i8 elements. Low i4 elements of each byte are
+  //    placed in one vector and the high i4 elements in another vector.
+  Value low = extFn(rewriter, loc, i8Vector, 0, 4);
+  Value high = extFn(rewriter, loc, i8Vector, 4, 4);
 
   // 3. Interleave low and high i8 elements.
   return rewriter.create<vector::InterleaveOp>(loc, low, high);
 }
 
+/// Rewrite the i2 -> i8 extension into a sequence of shuffles and
+/// bitwise ops to avoid leaving LLVM to scramble with peephole optimizations.
+static Value rewriteI2ToI8Ext(PatternRewriter &rewriter, Location loc,
+                              Value srcValue, const ExtractNBitsFn &extFn) {
+  [[maybe_unused]] VectorType srcVecType = cast<VectorType>(srcValue.getType());
+  assert(srcVecType.getElementType().isSignlessInteger(2) &&
+         "Expected i2 type");
+
+  // 1. Generate a bitcast vector<Xxi2> -> vector<X/4xi8>.
+  Value i8Vector = bitcastSubByteVectorToI8(rewriter, loc, srcValue);
+
+  // 2. Extract each i2 element.
+  // Position 0 (bits 0-1)
+  Value vec0 = extFn(rewriter, loc, i8Vector, 0, 2);
+  // Position 1 (bits 2-3)
+  Value vec1 = extFn(rewriter, loc, i8Vector, 2, 2);
+  // Position 2 (bits 4-5)
+  Value vec2 = extFn(rewriter, loc, i8Vector, 4, 2);
+  // Position 3 (bits 6-7)
+  Value vec3 = extFn(rewriter, loc, i8Vector, 6, 2);
+
+  // 3. Interleave all 4 elements by first interleaving
+  //    even elements and then odd
+  // vec0  = [0,0,0,0],...
+  // vec1  = [1,1,1,1],...
+  // vec2  = [2,2,2,2],...
+  // vec3  = [3,3,3,3],...
+  // 02    = [0,2,0,2,0,2,0,2],...
+  // 13    = [1,3,1,3,1,3,1,3],...
+  // 0213  = [0,1,2,3,...],...
+  Value interleave02 = rewriter.create<vector::InterleaveOp>(loc, vec0, vec2);
+  Value interleave13 = rewriter.create<vector::InterleaveOp>(loc, vec1, vec3);
+  return rewriter.create<vector::InterleaveOp>(loc, interleave02,
+                                               interleave13);
+}
+
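The double interleave at the end of `rewriteI2ToI8Ext` is what restores the original element order: interleaving (vec0, vec2) and (vec1, vec3) first, and then the two results, yields 0,1,2,3 for each source byte. A plain-array sketch of that index bookkeeping (the `interleave` helper below is hypothetical, standing in for vector.interleave):

```cpp
#include <cstdio>
#include <vector>

// Zip two equally sized sequences: a[0], b[0], a[1], b[1], ...
static std::vector<int> interleave(const std::vector<int> &a,
                                   const std::vector<int> &b) {
  std::vector<int> out;
  for (size_t i = 0; i < a.size(); ++i) {
    out.push_back(a[i]);
    out.push_back(b[i]);
  }
  return out;
}

int main() {
  // Each vecN holds the element at position N of every byte.
  std::vector<int> v0{0, 0}, v1{1, 1}, v2{2, 2}, v3{3, 3};
  for (int x : interleave(interleave(v0, v2), interleave(v1, v3)))
    std::printf("%d ", x); // prints: 0 1 2 3 0 1 2 3
  std::printf("\n");
}
```

Interleaving even positions with even strides first (0 with 2, 1 with 3) is what makes the final zip land every element at its original offset.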
 /// Rewrite the i8 -> i4 truncation into a deinterleave and series of bitwise
-/// ops that take advantage of high-level information to avoid leaving LLVM to
-/// scramble with peephole optimizations.
+/// ops to avoid leaving LLVM to scramble with peephole optimizations.
 static Value rewriteI8ToI4Trunc(PatternRewriter &rewriter, Location loc,
                                 Value srcValue) {
   VectorType srcVecType = cast<VectorType>(srcValue.getType());
@@ -1443,13 +1544,19 @@ struct RewriteAlignedSubByteIntExt : OpRewritePattern<ConversionOpType> {
       return failure();
 
     // Perform the rewrite.
+    Location loc = conversionOp.getLoc();
+    const auto &extFn = isSigned ? extractNBitsPerByteAndSignExtendToI8
+                                 : extractNBitsPerByteAndExtendToI8;
     Value subByteExt;
-    if (isSigned) {
-      subByteExt =
-          rewriteI4ToI8SignedExt(rewriter, conversionOp.getLoc(), srcValue);
-    } else {
-      subByteExt =
-          rewriteI4ToI8UnsignedExt(rewriter, conversionOp.getLoc(), srcValue);
+    switch (srcVecType.getElementType().getIntOrFloatBitWidth()) {
+    case 2:
+      subByteExt = rewriteI2ToI8Ext(rewriter, loc, srcValue, extFn);
+      break;
+    case 4:
+      subByteExt = rewriteI4ToI8Ext(rewriter, loc, srcValue, extFn);
+      break;
+    default:
+      return failure();
     }
 
     // Finalize the rewrite.
@@ -1490,6 +1597,10 @@ struct RewriteAlignedSubByteIntTrunc : OpRewritePattern<arith::TruncIOp> {
     if (failed(commonConversionPrecondition(rewriter, srcVecType, truncOp)))
       return failure();
 
+    // TODO: Add support for truncating to i2.
+    if (dstVecType.getElementType().getIntOrFloatBitWidth() == 2)
+      return failure();
+
     // Check general alignment preconditions. We invert the src/dst type order
     // to reuse the existing precondition logic.
     if (failed(alignedConversionPrecondition(rewriter, dstVecType, srcVecType,
diff --git a/mlir/lib/IR/BuiltinTypeInterfaces.cpp b/mlir/lib/IR/BuiltinTypeInterfaces.cpp
index ab8e65b5edfed..c663f6c909460 100644
--- a/mlir/lib/IR/BuiltinTypeInterfaces.cpp
+++ b/mlir/lib/IR/BuiltinTypeInterfaces.cpp
@@ -8,6 +8,7 @@
 
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
+#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/Sequence.h"
 
 using namespace mlir;
@@ -19,6 +20,18 @@ using namespace mlir::detail;
 
 #include "mlir/IR/BuiltinTypeInterfaces.cpp.inc"
 
+//===----------------------------------------------------------------------===//
+// FloatType
+//===----------------------------------------------------------------------===//
+
+unsigned FloatType::getWidth() {
+  return APFloat::semanticsSizeInBits(getFloatSemantics());
+}
+
+unsigned FloatType::getFPMantissaWidth() {
+  return APFloat::semanticsPrecision(getFloatSemantics());
+}
+
 //===----------------------------------------------------------------------===//
 // ShapedType
 //===----------------------------------------------------------------------===//
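With this move, both `getWidth` and `getFPMantissaWidth` are derived entirely from the APFloat semantics object. A usage sketch (assumes an MLIR build environment; `FloatType::getBF16` is assumed available alongside the `FloatType::getF32`/`getF64` accessors that this patch itself uses, and the expected numbers follow from IEEE-754 single precision and bfloat16):

```cpp
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"
#include <cstdio>

int main() {
  mlir::MLIRContext ctx;
  mlir::FloatType f32 = mlir::FloatType::getF32(&ctx);
  mlir::FloatType bf16 = mlir::FloatType::getBF16(&ctx);
  // f32: 32 bits wide, 24-bit mantissa (23 stored + 1 implicit).
  std::printf("f32:  %u %u\n", f32.getWidth(), f32.getFPMantissaWidth());
  // bf16: 16 bits wide, 8-bit mantissa (7 stored + 1 implicit).
  std::printf("bf16: %u %u\n", bf16.getWidth(), bf16.getFPMantissaWidth());
}
```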
diff --git a/mlir/lib/IR/BuiltinTypes.cpp b/mlir/lib/IR/BuiltinTypes.cpp
index 6546234429c8c..41b794bc0aec5 100644
--- a/mlir/lib/IR/BuiltinTypes.cpp
+++ b/mlir/lib/IR/BuiltinTypes.cpp
@@ -87,72 +87,54 @@ IntegerType IntegerType::scaleElementBitwidth(unsigned scale) {
 }
 
 //===----------------------------------------------------------------------===//
-// Float Type
-//===----------------------------------------------------------------------===//
-
-unsigned FloatType::getWidth() {
-  return APFloat::semanticsSizeInBits(getFloatSemantics());
-}
-
-/// Returns the floating semantics for the given type.
-const llvm::fltSemantics &FloatType::getFloatSemantics() {
-  if (llvm::isa<Float4E2M1FNType>(*this))
-    return APFloat::Float4E2M1FN();
-  if (llvm::isa<Float6E2M3FNType>(*this))
-    return APFloat::Float6E2M3FN();
-  if (llvm::isa<Float6E3M2FNType>(*this))
-    return APFloat::Float6E3M2FN();
-  if (llvm::isa<Float8E5M2Type>(*this))
-    return APFloat::Float8E5M2();
-  if (llvm::isa<Float8E4M3Type>(*this))
-    return APFloat::Float8E4M3();
-  if (llvm::isa<Float8E4M3FNType>(*this))
-    return APFloat::Float8E4M3FN();
-  if (llvm::isa<Float8E5M2FNUZType>(*this))
-    return APFloat::Float8E5M2FNUZ();
-  if (llvm::isa<Float8E4M3FNUZType>(*this))
-    return APFloat::Float8E4M3FNUZ();
-  if (llvm::isa<Float8E4M3B11FNUZType>(*this))
-    return APFloat::Float8E4M3B11FNUZ();
-  if (llvm::isa<Float8E3M4Type>(*this))
-    return APFloat::Float8E3M4();
-  if (llvm::isa<Float8E8M0FNUType>(*this))
-    return APFloat::Float8E8M0FNU();
-  if (llvm::isa<BFloat16Type>(*this))
-    return APFloat::BFloat();
-  if (llvm::isa<Float16Type>(*this))
-    return APFloat::IEEEhalf();
-  if (llvm::isa<FloatTF32Type>(*this))
-    return APFloat::FloatTF32();
-  if (llvm::isa<Float32Type>(*this))
-    return APFloat::IEEEsingle();
-  if (llvm::isa<Float64Type>(*this))
-    return APFloat::IEEEdouble();
-  if (llvm::isa<Float80Type>(*this))
-    return APFloat::x87DoubleExtended();
-  if (llvm::isa<Float128Type>(*this))
-    return APFloat::IEEEquad();
-  llvm_unreachable("non-floating point type used");
-}
-
-FloatType FloatType::scaleElementBitwidth(unsigned scale) {
-  if (!scale)
-    return FloatType();
-  MLIRContext *ctx = getContext();
-  if (isF16() || isBF16()) {
-    if (scale == 2)
-      return FloatType::getF32(ctx);
-    if (scale == 4)
-      return FloatType::getF64(ctx);
+// Float Types
+//===----------------------------------------------------------------------===//
+
+// Mapping from MLIR FloatType to APFloat semantics.
+#define FLOAT_TYPE_SEMANTICS(TYPE, SEM)                                        \
+  const llvm::fltSemantics &TYPE::getFloatSemantics() const {                  \
+    return APFloat::SEM();                                                     \
   }
-  if (isF32())
-    if (scale == 2)
-      return FloatType::getF64(ctx);
+FLOAT_TYPE_SEMANTICS(Float4E2M1FNType, Float4E2M1FN)
+FLOAT_TYPE_SEMANTICS(Float6E2M3FNType, Float6E2M3FN)
+FLOAT_TYPE_SEMANTICS(Float6E3M2FNType, Float6E3M2FN)
+FLOAT_TYPE_SEMANTICS(Float8E5M2Type, Float8E5M2)
+FLOAT_TYPE_SEMANTICS(Float8E4M3Type, Float8E4M3)
+FLOAT_TYPE_SEMANTICS(Float8E4M3FNType, Float8E4M3FN)
+FLOAT_TYPE_SEMANTICS(Float8E5M2FNUZType, Float8E5M2FNUZ)
+FLOAT_TYPE_SEMANTICS(Float8E4M3FNUZType, Float8E4M3FNUZ)
+FLOAT_TYPE_SEMANTICS(Float8E4M3B11FNUZType, Float8E4M3B11FNUZ)
+FLOAT_TYPE_SEMANTICS(Float8E3M4Type, Float8E3M4)
+FLOAT_TYPE_SEMANTICS(Float8E8M0FNUType, Float8E8M0FNU)
+FLOAT_TYPE_SEMANTICS(BFloat16Type, BFloat)
+FLOAT_TYPE_SEMANTICS(Float16Type, IEEEhalf)
+FLOAT_TYPE_SEMANTICS(FloatTF32Type, FloatTF32)
+FLOAT_TYPE_SEMANTICS(Float32Type, IEEEsingle)
+FLOAT_TYPE_SEMANTICS(Float64Type, IEEEdouble)
+FLOAT_TYPE_SEMANTICS(Float80Type, x87DoubleExtended)
+FLOAT_TYPE_SEMANTICS(Float128Type, IEEEquad)
+#undef FLOAT_TYPE_SEMANTICS
+
+FloatType Float16Type::scaleElementBitwidth(unsigned scale) const {
+  if (scale == 2)
+    return FloatType::getF32(getContext());
+  if (scale == 4)
+    return FloatType::getF64(getContext());
   return FloatType();
 }
 
-unsigned FloatType::getFPMantissaWidth() {
-  return APFloat::semanticsPrecision(getFloatSemantics());
+FloatType BFloat16Type::scaleElementBitwidth(unsigned scale) const {
+  if (scale == 2)
+    return FloatType::getF32(getContext());
+  if (scale == 4)
+    return FloatType::getF64(getContext());
+  return FloatType();
+}
+
+FloatType Float32Type::scaleElementBitwidth(unsigned scale) const {
+  if (scale == 2)
+    return FloatType::getF64(getContext());
+  return FloatType();
 }
 
 //===----------------------------------------------------------------------===//
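Hand-expanding the macro for one subclass shows what replaced the long isa-chain: each concrete float type now answers `getFloatSemantics` for itself, so there is no `llvm_unreachable` fallback left. For example, `FLOAT_TYPE_SEMANTICS(BFloat16Type, BFloat)` expands to:

```cpp
const llvm::fltSemantics &BFloat16Type::getFloatSemantics() const {
  return APFloat::BFloat();
}
```

Likewise, `scaleElementBitwidth` moves from a single `FloatType` method with type checks to per-subclass overrides on `Float16Type`, `BFloat16Type`, and `Float32Type`; other float types keep returning a null `FloatType` to signal that scaling is unsupported.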
diff --git a/mlir/lib/Interfaces/ViewLikeInterface.cpp b/mlir/lib/Interfaces/ViewLikeInterface.cpp
index ca33636336bf0..57b5cce7bb13b 100644
--- a/mlir/lib/Interfaces/ViewLikeInterface.cpp
+++ b/mlir/lib/Interfaces/ViewLikeInterface.cpp
@@ -113,7 +113,8 @@ static char getRightDelimiter(AsmParser::Delimiter delimiter) {
 
 void mlir::printDynamicIndexList(OpAsmPrinter &printer, Operation *op,
                                  OperandRange values,
                                  ArrayRef<int64_t> integers,
-                                 ArrayRef<bool> scalables, TypeRange valueTypes,
+                                 ArrayRef<bool> scalableFlags,
+                                 TypeRange valueTypes,
                                  AsmParser::Delimiter delimiter) {
   char leftDelimiter = getLeftDelimiter(delimiter);
   char rightDelimiter = getRightDelimiter(delimiter);
@@ -126,7 +127,7 @@ void mlir::printDynamicIndexList(OpAsmPrinter &printer, Operation *op,
   unsigned dynamicValIdx = 0;
   unsigned scalableIndexIdx = 0;
   llvm::interleaveComma(integers, printer, [&](int64_t integer) {
-    if (!scalables.empty() && scalables[scalableIndexIdx])
+    if (!scalableFlags.empty() && scalableFlags[scalableIndexIdx])
       printer << "[";
     if (ShapedType::isDynamic(integer)) {
       printer << values[dynamicValIdx];
@@ -136,7 +137,7 @@ void mlir::printDynamicIndexList(OpAsmPrinter &printer, Operation *op,
     } else {
       printer << integer;
     }
-    if (!scalables.empty() && scalables[scalableIndexIdx])
+    if (!scalableFlags.empty() && scalableFlags[scalableIndexIdx])
      printer << "]";
 
     scalableIndexIdx++;
@@ -148,7 +149,7 @@ ParseResult mlir::parseDynamicIndexList(
 ParseResult mlir::parseDynamicIndexList(
     OpAsmParser &parser,
     SmallVectorImpl<OpAsmParser::UnresolvedOperand> &values,
-    DenseI64ArrayAttr &integers, DenseBoolArrayAttr &scalables,
+    DenseI64ArrayAttr &integers, DenseBoolArrayAttr &scalableFlags,
    SmallVectorImpl<Type> *valueTypes,
     AsmParser::Delimiter delimiter) {
   SmallVector<int64_t> integerVals;
@@ -183,7 +184,7 @@ ParseResult mlir::parseDynamicIndexList(
     return parser.emitError(parser.getNameLoc())
            << "expected SSA value or integer";
   integers = parser.getBuilder().getDenseI64ArrayAttr(integerVals);
-  scalables = parser.getBuilder().getDenseBoolArrayAttr(scalableVals);
+  scalableFlags = parser.getBuilder().getDenseBoolArrayAttr(scalableVals);
   return success();
 }
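The scalables -> scalableFlags rename is purely cosmetic; the printing behavior is unchanged: each static size prints as an integer, each dynamic entry consumes the next SSA operand, and a set scalable flag wraps the entry in square brackets. A standalone mock of that loop (`kDynamic` stands in for `ShapedType::kDynamic`, and the SSA name is made up):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t kDynamic = INT64_MIN; // sentinel for a dynamic size
  std::vector<int64_t> integers = {4, kDynamic, 8};
  std::vector<bool> scalableFlags = {true, false, false};
  std::vector<const char *> values = {"%sz"}; // one dynamic operand
  unsigned dynamicValIdx = 0, scalableIdx = 0;
  const char *sep = "";
  for (int64_t integer : integers) {
    std::printf("%s", sep);
    if (!scalableFlags.empty() && scalableFlags[scalableIdx])
      std::printf("[");
    if (integer == kDynamic)
      std::printf("%s", values[dynamicValIdx++]); // dynamic -> SSA value
    else
      std::printf("%lld", (long long)integer);    // static -> literal
    if (!scalableFlags.empty() && scalableFlags[scalableIdx])
      std::printf("]");
    ++scalableIdx;
    sep = ", ";
  }
  std::printf("\n"); // prints: [4], %sz, 8
}
```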
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 0be515e63b470..c7dce5d6c6556 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -183,10 +183,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
     result = op.emitError("not yet implemented: host evaluation of loop "
                           "bounds in omp.target operation");
   };
-  auto checkIf = [&todo](auto op, LogicalResult &result) {
-    if (op.getIfExpr())
-      result = todo("if");
-  };
   auto checkInReduction = [&todo](auto op, LogicalResult &result) {
     if (!op.getInReductionVars().empty() || op.getInReductionByref() ||
         op.getInReductionSyms())
@@ -306,7 +302,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
       checkDevice(op, result);
       checkHasDeviceAddr(op, result);
       checkHostEval(op, result);
-      checkIf(op, result);
       checkInReduction(op, result);
       checkIsDevicePtr(op, result);
       checkPrivate(op, result);
@@ -1349,13 +1344,23 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
                     llvm::SmallVectorImpl<llvm::Value *> &llvmPrivateVars,
                     const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
                     llvm::DenseMap<Value, llvm::Value *> *mappedPrivateVars = nullptr) {
-  llvm::IRBuilderBase::InsertPointGuard guard(builder);
   // Allocate private vars
   llvm::BranchInst *allocaTerminator =
       llvm::cast<llvm::BranchInst>(allocaIP.getBlock()->getTerminator());
+  if (allocaTerminator->getNumSuccessors() != 1) {
+    splitBB(llvm::OpenMPIRBuilder::InsertPointTy(
+                allocaIP.getBlock(), allocaTerminator->getIterator()),
+            true, "omp.region.after_alloca");
+  }
+
+  llvm::IRBuilderBase::InsertPointGuard guard(builder);
+  // Update the allocaTerminator in case the alloca block was split above.
+  allocaTerminator =
+      llvm::cast<llvm::BranchInst>(allocaIP.getBlock()->getTerminator());
   builder.SetInsertPoint(allocaTerminator);
   assert(allocaTerminator->getNumSuccessors() == 1 &&
          "This is an unconditional branch created by OpenMPIRBuilder");
+
   llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0);
 
   // FIXME: Some of the allocation regions do more than just allocating.
@@ -1885,11 +1890,6 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
   SmallVector<llvm::Value *> privateReductionVariables(
       wsloopOp.getNumReductionVars());
 
-  splitBB(llvm::OpenMPIRBuilder::InsertPointTy(
-              allocaIP.getBlock(),
-              allocaIP.getBlock()->getTerminator()->getIterator()),
-          true, "omp.region.after_alloca");
-
   llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
       builder, moduleTranslation, privateBlockArgs, privateDecls,
       mlirPrivateVars, llvmPrivateVars, allocaIP);
@@ -4378,10 +4378,14 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
       findAllocaInsertPoint(builder, moduleTranslation);
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
 
+  llvm::Value *ifCond = nullptr;
+  if (Value targetIfCond = targetOp.getIfExpr())
+    ifCond = moduleTranslation.lookupValue(targetIfCond);
+
   llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
       moduleTranslation.getOpenMPBuilder()->createTarget(
          ompLoc, isOffloadEntry, allocaIP, builder.saveIP(), entryInfo,
-          defaultAttrs, runtimeAttrs, kernelInput, genMapInfoCB, bodyCB,
+          defaultAttrs, runtimeAttrs, ifCond, kernelInput, genMapInfoCB, bodyCB,
          argAccessorCB, dds, targetOp.getNowait());
 
   if (failed(handleError(afterIP, opInst)))
diff --git a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
index 210025e30d7db..8d28f248e392d 100644
--- a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
+++ b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
@@ -193,6 +193,25 @@ func.func @f3ext(%a: vector<5xi8>) -> vector<8xi17> {
   return %1 : vector<8xi17>
 }
 
+
+// Negative test - the trailing dim 1 is not a multiple of 2 (i.e. 8 / 4).
+// CHECK-LABEL: func.func @unaligned_extsi_i4_to_i8(
+func.func @unaligned_extsi_i4_to_i8(%a: vector<1xi4>) -> vector<1xi8> {
+  // CHECK-NOT: arith.bitcast
+  // CHECK: arith.extsi %[[IN:.*]] : vector<1xi4> to vector<1xi8>
+  %0 = arith.extsi %a : vector<1xi4> to vector<1xi8>
+  return %0 : vector<1xi8>
+}
+
+// Negative test - the trailing dim 2 is not a multiple of 4 (i.e. 8 / 2).
+// CHECK-LABEL: func.func @unaligned_extsi_i2_to_i8( +func.func @unaligned_extsi_i2_to_i8(%a: vector<2xi2>) -> vector<2xi8> { + // CHECK-NOT: arith.bitcast + // CHECK: arith.extsi %[[IN:.*]] : vector<2xi2> to vector<2xi8> + %0 = arith.extsi %a : vector<2xi2> to vector<2xi8> + return %0 : vector<2xi8> +} + // CHECK-LABEL: func.func @aligned_extsi_i4_to_i8( func.func @aligned_extsi_i4_to_i8(%a: vector<8xi4>) -> vector<8xi8> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi8> { @@ -206,6 +225,31 @@ func.func @aligned_extsi_i4_to_i8(%a: vector<8xi4>) -> vector<8xi8> { return %0 : vector<8xi8> } +// CHECK-LABEL: func.func @aligned_extsi_i2_to_i8( +func.func @aligned_extsi_i2_to_i8(%a: vector<8xi2>) -> vector<8xi8> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi2>) -> vector<8xi8> { +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<2xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<2xi8> +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<2xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi2> to vector<2xi8> +// Extract bits 0-1 +// CHECK: %[[SHL_6:.*]] = arith.shli %[[BITCAST]], %[[CST_6]] : vector<2xi8> +// CHECK: %[[ELEM0:.*]] = arith.shrsi %[[SHL_6]], %[[CST_6]] : vector<2xi8> +// Extract bits 2-3 +// CHECK: %[[SHL_4:.*]] = arith.shli %[[BITCAST]], %[[CST_4]] : vector<2xi8> +// CHECK: %[[ELEM1:.*]] = arith.shrsi %[[SHL_4]], %[[CST_6]] : vector<2xi8> +// Extract bits 4-5 +// CHECK: %[[SHL_2:.*]] = arith.shli %[[BITCAST]], %[[CST_2]] : vector<2xi8> +// CHECK: %[[ELEM2:.*]] = arith.shrsi %[[SHL_2]], %[[CST_6]] : vector<2xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrsi %[[BITCAST]], %[[CST_6]] : vector<2xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<2xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<2xi8> +// CHECK: %[[RESULT:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<4xi8> + %0 = arith.extsi %a : vector<8xi2> to vector<8xi8> + return %0 : vector<8xi8> +} + // CHECK-LABEL: func.func @aligned_extsi_i4_to_i32( func.func @aligned_extsi_i4_to_i32(%a: vector<8xi4>) -> vector<8xi32> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi32> { @@ -220,8 +264,34 @@ func.func @aligned_extsi_i4_to_i32(%a: vector<8xi4>) -> vector<8xi32> { return %0 : vector<8xi32> } -// CHECK-LABEL: func.func @aligned_extsi_2d( -func.func @aligned_extsi_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { +// CHECK-LABEL: func.func @aligned_extsi_i2_to_i32( +func.func @aligned_extsi_i2_to_i32(%a: vector<8xi2>) -> vector<8xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi2>) -> vector<8xi32> { +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<2xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<2xi8> +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<2xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi2> to vector<2xi8> +// Extract bits 0-1 +// CHECK: %[[SHL_6:.*]] = arith.shli %[[BITCAST]], %[[CST_6]] : vector<2xi8> +// CHECK: %[[ELEM0:.*]] = arith.shrsi %[[SHL_6]], %[[CST_6]] : vector<2xi8> +// Extract bits 2-3 +// CHECK: %[[SHL_4:.*]] = arith.shli %[[BITCAST]], %[[CST_4]] : vector<2xi8> +// CHECK: %[[ELEM1:.*]] = arith.shrsi %[[SHL_4]], %[[CST_6]] : vector<2xi8> +// Extract bits 4-5 +// CHECK: %[[SHL_2:.*]] = arith.shli %[[BITCAST]], %[[CST_2]] : vector<2xi8> +// CHECK: %[[ELEM2:.*]] = arith.shrsi %[[SHL_2]], %[[CST_6]] : vector<2xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrsi %[[BITCAST]], 
%[[CST_6]] : vector<2xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<2xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<2xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<4xi8> +// CHECK: %[[RESULT:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8xi8> to vector<8xi32> + %0 = arith.extsi %a : vector<8xi2> to vector<8xi32> + return %0 : vector<8xi32> +} + +// CHECK-LABEL: func.func @aligned_extsi_i4_to_i32_2d( +func.func @aligned_extsi_i4_to_i32_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { // CHECK-SAME: %[[IN:.*]]: vector<8x32xi4>) -> vector<8x32xi32> { // CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8> // CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi4> to vector<8x16xi8> @@ -234,6 +304,32 @@ func.func @aligned_extsi_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { return %0 : vector<8x32xi32> } +// CHECK-LABEL: func.func @aligned_extsi_i2_to_i32_2d( +func.func @aligned_extsi_i2_to_i32_2d(%a: vector<8x32xi2>) -> vector<8x32xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8x32xi2>) -> vector<8x32xi32> { +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<8x8xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<8x8xi8> +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<8x8xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi2> to vector<8x8xi8> +// Extract bits 0-1 +// CHECK: %[[SHL_6:.*]] = arith.shli %[[BITCAST]], %[[CST_6]] : vector<8x8xi8> +// CHECK: %[[ELEM0:.*]] = arith.shrsi %[[SHL_6]], %[[CST_6]] : vector<8x8xi8> +// Extract bits 2-3 +// CHECK: %[[SHL_4:.*]] = arith.shli %[[BITCAST]], %[[CST_4]] : vector<8x8xi8> +// CHECK: %[[ELEM1:.*]] = arith.shrsi %[[SHL_4]], %[[CST_6]] : vector<8x8xi8> +// Extract bits 4-5 +// CHECK: %[[SHL_2:.*]] = arith.shli %[[BITCAST]], %[[CST_2]] : vector<8x8xi8> +// CHECK: %[[ELEM2:.*]] = arith.shrsi %[[SHL_2]], %[[CST_6]] : vector<8x8xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrsi %[[BITCAST]], %[[CST_6]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<8x16xi8> +// CHECK: %[[RESULT:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xi32> + %0 = arith.extsi %a : vector<8x32xi2> to vector<8x32xi32> + return %0 : vector<8x32xi32> +} + // CHECK-LABEL: func.func @aligned_trunci_i8_to_i4( func.func @aligned_trunci_i8_to_i4(%a: vector<8xi8>) -> vector<8xi4> { @@ -292,6 +388,13 @@ func.func @aligned_trunci_nd(%a: vector<3x8x32xi32>) -> vector<3x8x32xi4> { return %0 : vector<3x8x32xi4> } +func.func @aligned_trunci_i8_to_i2_no_match(%a: vector<8xi8>) -> vector<8xi2> { + // CHECK-NOT: arith.bitcast + // CHECK: arith.trunci %[[IN:.*]] : vector<8xi8> to vector<8xi2> + %0 = arith.trunci %a : vector<8xi8> to vector<8xi2> + return %0 : vector<8xi2> +} + // CHECK-LABEL: func.func @aligned_extui_i4_to_i8( func.func @aligned_extui_i4_to_i8(%a: vector<8xi4>) -> vector<8xi8> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi8> { @@ -305,6 +408,31 @@ func.func @aligned_extui_i4_to_i8(%a: vector<8xi4>) -> vector<8xi8> { return %0 : vector<8xi8> } +// CHECK-LABEL: func.func @aligned_extui_i2_to_i8( +func.func @aligned_extui_i2_to_i8(%a: vector<8xi2>) -> vector<8xi8> { +// CHECK-SAME: %[[IN:.*]]: 
vector<8xi2>) -> vector<8xi8> { +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<2xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<2xi8> +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<2xi8> +// CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<3> : vector<2xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi2> to vector<2xi8> +// Extract bits 0-1 +// CHECK: %[[ELEM0:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 2-3 +// CHECK: %[[SHR_2:.*]] = arith.shrui %[[BITCAST]], %[[CST_2]] : vector<2xi8> +// CHECK: %[[ELEM1:.*]] = arith.andi %[[SHR_2]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 4-5 +// CHECK: %[[SHR_4:.*]] = arith.shrui %[[BITCAST]], %[[CST_4]] : vector<2xi8> +// CHECK: %[[ELEM2:.*]] = arith.andi %[[SHR_4]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrui %[[BITCAST]], %[[CST_6]] : vector<2xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<2xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<2xi8> +// CHECK: %[[RESULT:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<4xi8> + %0 = arith.extui %a : vector<8xi2> to vector<8xi8> + return %0 : vector<8xi8> +} + // CHECK-LABEL: func.func @aligned_extui_i4_to_i32( func.func @aligned_extui_i4_to_i32(%a: vector<8xi4>) -> vector<8xi32> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi32> { @@ -319,8 +447,34 @@ func.func @aligned_extui_i4_to_i32(%a: vector<8xi4>) -> vector<8xi32> { return %0 : vector<8xi32> } -// CHECK-LABEL: func.func @aligned_extui_2d( -func.func @aligned_extui_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { +// CHECK-LABEL: func.func @aligned_extui_i2_to_i32( +func.func @aligned_extui_i2_to_i32(%a: vector<8xi2>) -> vector<8xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi2>) -> vector<8xi32> { +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<2xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<2xi8> +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<2xi8> +// CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<3> : vector<2xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi2> to vector<2xi8> +// Extract bits 0-1 +// CHECK: %[[ELEM0:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 2-3 +// CHECK: %[[SHR_2:.*]] = arith.shrui %[[BITCAST]], %[[CST_2]] : vector<2xi8> +// CHECK: %[[ELEM1:.*]] = arith.andi %[[SHR_2]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 4-5 +// CHECK: %[[SHR_4:.*]] = arith.shrui %[[BITCAST]], %[[CST_4]] : vector<2xi8> +// CHECK: %[[ELEM2:.*]] = arith.andi %[[SHR_4]], %[[LOWBITS_MASK]] : vector<2xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrui %[[BITCAST]], %[[CST_6]] : vector<2xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<2xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<2xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<4xi8> +// CHECK: %[[RESULT:.*]] = arith.extui %[[INTERLEAVE]] : vector<8xi8> to vector<8xi32> + %0 = arith.extui %a : vector<8xi2> to vector<8xi32> + return %0 : vector<8xi32> +} + +// CHECK-LABEL: func.func @aligned_extui_i4_to_i32_2d( +func.func @aligned_extui_i4_to_i32_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { // CHECK-SAME: %[[VAL_0:.*]]: vector<8x32xi4>) -> vector<8x32xi32> { // CHECK: %[[I4_BITS:.*]] = 
arith.constant dense<4> : vector<8x16xi8> // CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<15> : vector<8x16xi8> @@ -333,6 +487,32 @@ func.func @aligned_extui_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { return %0 : vector<8x32xi32> } +// CHECK-LABEL: func.func @aligned_extui_i2_to_i32_2d( +func.func @aligned_extui_i2_to_i32_2d(%a: vector<8x32xi2>) -> vector<8x32xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8x32xi2>) -> vector<8x32xi32> { +// CHECK: %[[CST_6:.*]] = arith.constant dense<6> : vector<8x8xi8> +// CHECK: %[[CST_4:.*]] = arith.constant dense<4> : vector<8x8xi8> +// CHECK: %[[CST_2:.*]] = arith.constant dense<2> : vector<8x8xi8> +// CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<3> : vector<8x8xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi2> to vector<8x8xi8> +// Extract bits 0-1 +// CHECK: %[[ELEM0:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<8x8xi8> +// Extract bits 2-3 +// CHECK: %[[SHR_2:.*]] = arith.shrui %[[BITCAST]], %[[CST_2]] : vector<8x8xi8> +// CHECK: %[[ELEM1:.*]] = arith.andi %[[SHR_2]], %[[LOWBITS_MASK]] : vector<8x8xi8> +// Extract bits 4-5 +// CHECK: %[[SHR_4:.*]] = arith.shrui %[[BITCAST]], %[[CST_4]] : vector<8x8xi8> +// CHECK: %[[ELEM2:.*]] = arith.andi %[[SHR_4]], %[[LOWBITS_MASK]] : vector<8x8xi8> +// Extract bits 6-7 +// CHECK: %[[ELEM3:.*]] = arith.shrui %[[BITCAST]], %[[CST_6]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE02:.*]] = vector.interleave %[[ELEM0]], %[[ELEM2]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE13:.*]] = vector.interleave %[[ELEM1]], %[[ELEM3]] : vector<8x8xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[INTERLEAVE02]], %[[INTERLEAVE13]] : vector<8x16xi8> +// CHECK: %[[RESULT:.*]] = arith.extui %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xi32> + %0 = arith.extui %a : vector<8x32xi2> to vector<8x32xi32> + return %0 : vector<8x32xi32> +} + // CHECK-LABEL: func.func @aligned_sitofp( func.func @aligned_sitofp(%a: vector<8xi4>) -> vector<8xf32> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xf32> { diff --git a/mlir/test/Dialect/Vector/vector-sink.mlir b/mlir/test/Dialect/Vector/vector-sink.mlir index 5a3699333265c..7ce840575a803 100644 --- a/mlir/test/Dialect/Vector/vector-sink.mlir +++ b/mlir/test/Dialect/Vector/vector-sink.mlir @@ -228,6 +228,16 @@ func.func @broadcast_vector_extsi(%a : vector<4xi8>) -> vector<2x4xi32> { // ----- +func.func @broadcast_vector_extsi_scalable(%a : vector<[4]xi8>) -> vector<2x[4]xi32> { + // CHECK: %[[EXT:.+]] = arith.extsi %{{.+}} : vector<[4]xi8> to vector<[4]xi32> + // CHECK: vector.broadcast %[[EXT:.+]] : vector<[4]xi32> to vector<2x[4]xi32> + %b = vector.broadcast %a : vector<[4]xi8> to vector<2x[4]xi8> + %r = arith.extsi %b : vector<2x[4]xi8> to vector<2x[4]xi32> + return %r : vector<2x[4]xi32> +} + +// ----- + func.func @broadcast_scalar_extsi(%a : i8) -> vector<2x4xi32> { // CHECK: %[[EXT:.+]] = arith.extsi %{{.+}} : i8 to i32 // CHECK: vector.broadcast %[[EXT]] : i32 to vector<2x4xi32> @@ -236,6 +246,16 @@ func.func @broadcast_scalar_extsi(%a : i8) -> vector<2x4xi32> { return %r : vector<2x4xi32> } +// ----- + +func.func @broadcast_scalar_extsi_scalable(%a : i8) -> vector<2x[4]xi32> { + // CHECK: %[[EXT:.+]] = arith.extsi %{{.+}} : i8 to i32 + // CHECK: vector.broadcast %[[EXT]] : i32 to vector<2x[4]xi32> + %b = vector.broadcast %a : i8 to vector<2x[4]xi8> + %r = arith.extsi %b : vector<2x[4]xi8> to vector<2x[4]xi32> + return %r : vector<2x[4]xi32> +} + //===----------------------------------------------------------------------===// // 
[Pattern: ReorderElementwiseOpsOnTranspose] //===----------------------------------------------------------------------===// @@ -250,6 +270,16 @@ func.func @transpose_extsi(%a : vector<4x2xi8>) -> vector<2x4xi32> { // ----- +func.func @transpose_extsi_scalable(%a : vector<[4]x2xi8>) -> vector<2x[4]xi32> { + // CHECK: %[[EXT:.+]] = arith.extsi %{{.+}} : vector<[4]x2xi8> to vector<[4]x2xi32> + // CHECK: vector.transpose %[[EXT]], [1, 0] : vector<[4]x2xi32> to vector<2x[4]xi32> + %b = vector.transpose %a, [1, 0]: vector<[4]x2xi8> to vector<2x[4]xi8> + %r = arith.extsi %b : vector<2x[4]xi8> to vector<2x[4]xi32> + return %r : vector<2x[4]xi32> +} + +// ----- + // CHECK-LABEL: func @transpose_elementwise_same_type // CHECK-SAME: (%[[A:.+]]: vector<4x2xf32>, %[[B:.+]]: vector<4x2xf32>) // CHECK: %[[ADD:.+]] = arith.addf %[[A]], %[[B]] : vector<4x2xf32> @@ -265,6 +295,21 @@ func.func @transpose_elementwise_same_type(%a : vector<4x2xf32>, %b : vector<4x2 // ----- +// CHECK-LABEL: func @transpose_elementwise_same_type_scalable +// CHECK-SAME: (%[[A:.+]]: vector<[4]x2xf32>, %[[B:.+]]: vector<[4]x2xf32>) +// CHECK: %[[ADD:.+]] = arith.addf %[[A]], %[[B]] : vector<[4]x2xf32> +// CHECK: %[[T:.+]] = vector.transpose %[[ADD]], [1, 0] +// CHECK: return %[[T]] + +func.func @transpose_elementwise_same_type_scalable(%a : vector<[4]x2xf32>, %b : vector<[4]x2xf32>) -> vector<2x[4]xf32> { + %at = vector.transpose %a, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %bt = vector.transpose %b, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %r = arith.addf %at, %bt : vector<2x[4]xf32> + return %r : vector<2x[4]xf32> +} + +// ----- + // CHECK-LABEL: func @transpose_elementwise_diff_operand_types // CHECK-SAME: (%[[COND:.+]]: vector<4x2xi1>, %[[A:.+]]: vector<4x2xf32>, %[[B:.+]]: vector<4x2xf32>) // CHECK: %[[S:.+]] = arith.select %[[COND]], %[[A]], %[[B]] : vector<4x2xi1>, vector<4x2xf32> @@ -280,6 +325,21 @@ func.func @transpose_elementwise_diff_operand_types(%cond: vector<4x2xi1>, %a : // ----- +// CHECK-LABEL: func @transpose_elementwise_diff_operand_types_scalable +// CHECK-SAME: (%[[COND:.+]]: vector<[4]x2xi1>, %[[A:.+]]: vector<[4]x2xf32>, %[[B:.+]]: vector<[4]x2xf32>) +// CHECK: %[[S:.+]] = arith.select %[[COND]], %[[A]], %[[B]] : vector<[4]x2xi1>, vector<[4]x2xf32> +// CHECK: %[[T:.+]] = vector.transpose %[[S]], [1, 0] : vector<[4]x2xf32> to vector<2x[4]xf32> +// CHECK: return %[[T]] +func.func @transpose_elementwise_diff_operand_types_scalable(%cond: vector<[4]x2xi1>, %a : vector<[4]x2xf32>, %b : vector<[4]x2xf32>) -> vector<2x[4]xf32> { + %condt = vector.transpose %cond, [1, 0]: vector<[4]x2xi1> to vector<2x[4]xi1> + %at = vector.transpose %a, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %bt = vector.transpose %b, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %r = arith.select %condt, %at, %bt : vector<2x[4]xi1>, vector<2x[4]xf32> + return %r : vector<2x[4]xf32> +} + +// ----- + // CHECK-LABEL: func @transpose_elementwise_diff_operand_result_type // CHECK-SAME: (%[[A:.+]]: vector<4x2xf32>, %[[B:.+]]: vector<4x2xf32>) // CHECK: %[[CMP:.+]] = arith.cmpf olt, %[[A]], %[[B]] : vector<4x2xf32> @@ -294,6 +354,20 @@ func.func @transpose_elementwise_diff_operand_result_type(%a : vector<4x2xf32>, // ----- +// CHECK-LABEL: func @transpose_elementwise_diff_operand_result_type_scalable +// CHECK-SAME: (%[[A:.+]]: vector<[4]x2xf32>, %[[B:.+]]: vector<[4]x2xf32>) +// CHECK: %[[CMP:.+]] = arith.cmpf olt, %[[A]], %[[B]] : vector<[4]x2xf32> +// CHECK: %[[T:.+]] = vector.transpose %[[CMP]], [1, 0] : 
vector<[4]x2xi1> to vector<2x[4]xi1> +// CHECK: return %[[T]] +func.func @transpose_elementwise_diff_operand_result_type_scalable(%a : vector<[4]x2xf32>, %b : vector<[4]x2xf32>) -> vector<2x[4]xi1> { + %at = vector.transpose %a, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %bt = vector.transpose %b, [1, 0]: vector<[4]x2xf32> to vector<2x[4]xf32> + %r = arith.cmpf olt, %at, %bt : vector<2x[4]xf32> + return %r : vector<2x[4]xi1> +} + +// ----- + // CHECK-LABEL: func @transpose_elementwise_splat_constant // CHECK-SAME: (%[[A:.+]]: vector<4x6x3x2xf32>) // CHECK: %[[B:.+]] = arith.constant dense<5.000000e+00> : vector<4x6x3x2xf32> @@ -310,6 +384,22 @@ func.func @transpose_elementwise_splat_constant(%a : vector<4x6x3x2xf32>) -> vec // ----- +// CHECK-LABEL: func @transpose_elementwise_splat_constant_scalable +// CHECK-SAME: (%[[A:.+]]: vector<[4]x6x3x2xf32>) +// CHECK: %[[B:.+]] = arith.constant dense<5.000000e+00> : vector<[4]x6x3x2xf32> +// CHECK: %[[ADD:.+]] = arith.addf %[[A]], %[[B]] : vector<[4]x6x3x2xf32> +// CHECK: %[[T:.+]] = vector.transpose %[[ADD]], [1, 0, 3, 2] : vector<[4]x6x3x2xf32> to vector<6x[4]x2x3xf32> +// CHECK: return %[[T:.+]] : vector<6x[4]x2x3xf32> + +func.func @transpose_elementwise_splat_constant_scalable(%a : vector<[4]x6x3x2xf32>) -> vector<6x[4]x2x3xf32> { + %b = arith.constant dense<5.0> : vector<6x[4]x2x3xf32> + %at = vector.transpose %a, [1, 0, 3, 2]: vector<[4]x6x3x2xf32> to vector<6x[4]x2x3xf32> + %r = arith.addf %at, %b : vector<6x[4]x2x3xf32> + return %r : vector<6x[4]x2x3xf32> +} + +// ----- + // CHECK-LABEL: func @transpose_elementwise_diff_map // CHECK: vector.transpose // CHECK: vector.transpose @@ -320,3 +410,16 @@ func.func @transpose_elementwise_diff_map(%a : vector<4x6x3x2xf32>, %b: vector<6 %r = arith.addf %at, %bt : vector<6x4x2x3xf32> return %r : vector<6x4x2x3xf32> } + +// ----- + +// CHECK-LABEL: func @transpose_elementwise_diff_map_scalable +// CHECK: vector.transpose +// CHECK: vector.transpose +// CHECK: arith.addf +func.func @transpose_elementwise_diff_map_scalable(%a : vector<[4]x6x3x2xf32>, %b: vector<6x2x[4]x3xf32>) -> vector<6x[4]x2x3xf32> { + %at = vector.transpose %a, [1, 0, 3, 2]: vector<[4]x6x3x2xf32> to vector<6x[4]x2x3xf32> + %bt = vector.transpose %b, [0, 2, 1, 3]: vector<6x2x[4]x3xf32> to vector<6x[4]x2x3xf32> + %r = arith.addf %at, %bt : vector<6x[4]x2x3xf32> + return %r : vector<6x[4]x2x3xf32> +} diff --git a/mlir/test/Target/LLVMIR/omptarget-if.mlir b/mlir/test/Target/LLVMIR/omptarget-if.mlir new file mode 100644 index 0000000000000..706ad4411438b --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-if.mlir @@ -0,0 +1,68 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @target_if_variable(%x : i1) { + omp.target if(%x) { + omp.terminator + } + llvm.return + } + + // CHECK-LABEL: define void @target_if_variable( + // CHECK-SAME: i1 %[[IF_COND:.*]]) + // CHECK: br i1 %[[IF_COND]], label %[[THEN_LABEL:.*]], label %[[ELSE_LABEL:.*]] + + // CHECK: [[THEN_LABEL]]: + // CHECK-NOT: {{^.*}}: + // CHECK: %[[RC:.*]] = call i32 @__tgt_target_kernel + // CHECK-NEXT: %[[OFFLOAD_SUCCESS:.*]] = icmp ne i32 %[[RC]], 0 + // CHECK-NEXT: br i1 %[[OFFLOAD_SUCCESS]], label %[[OFFLOAD_FAIL_LABEL:.*]], label %[[OFFLOAD_CONT_LABEL:.*]] + + // CHECK: [[OFFLOAD_FAIL_LABEL]]: + // CHECK-NEXT: call void @[[FALLBACK_FN:__omp_offloading_.*_.*_target_if_variable_l.*]]() + // CHECK-NEXT: br label %[[OFFLOAD_CONT_LABEL]] + + // CHECK: 
[[OFFLOAD_CONT_LABEL]]: + // CHECK-NEXT: br label %[[END_LABEL:.*]] + + // CHECK: [[ELSE_LABEL]]: + // CHECK-NEXT: call void @[[FALLBACK_FN]]() + // CHECK-NEXT: br label %[[END_LABEL]] + + llvm.func @target_if_true() { + %0 = llvm.mlir.constant(true) : i1 + omp.target if(%0) { + omp.terminator + } + llvm.return + } + + // CHECK-LABEL: define void @target_if_true() + // CHECK-NOT: {{^.*}}: + // CHECK: br label %[[ENTRY:.*]] + + // CHECK: [[ENTRY]]: + // CHECK-NOT: {{^.*}}: + // CHECK: %[[RC:.*]] = call i32 @__tgt_target_kernel + // CHECK-NEXT: %[[OFFLOAD_SUCCESS:.*]] = icmp ne i32 %[[RC]], 0 + // CHECK-NEXT: br i1 %[[OFFLOAD_SUCCESS]], label %[[OFFLOAD_FAIL_LABEL:.*]], label %[[OFFLOAD_CONT_LABEL:.*]] + + // CHECK: [[OFFLOAD_FAIL_LABEL]]: + // CHECK-NEXT: call void @[[FALLBACK_FN:.*]]() + // CHECK-NEXT: br label %[[OFFLOAD_CONT_LABEL]] + + llvm.func @target_if_false() { + %0 = llvm.mlir.constant(false) : i1 + omp.target if(%0) { + omp.terminator + } + llvm.return + } + + // CHECK-LABEL: define void @target_if_false() + // CHECK-NEXT: br label %[[ENTRY:.*]] + + // CHECK: [[ENTRY]]: + // CHECK-NEXT: call void @__omp_offloading_{{.*}}_{{.*}}_target_if_false_l{{.*}}() +} + diff --git a/mlir/test/Target/LLVMIR/openmp-target-simd-on_device.mlir b/mlir/test/Target/LLVMIR/openmp-target-simd-on_device.mlir new file mode 100644 index 0000000000000..0ce90578ea9d6 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-simd-on_device.mlir @@ -0,0 +1,34 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {omp.is_target_device = true} { + omp.private {type = private} @simd_privatizer : !llvm.ptr alloc { + ^bb0(%arg0: !llvm.ptr): + omp.yield(%arg0 : !llvm.ptr) + } + + llvm.func @test_target_simd() { + omp.target { + %5 = llvm.mlir.constant(1 : i32) : i32 + %x = llvm.alloca %5 x i32 {bindc_name = "x"} : (i32) -> !llvm.ptr + omp.simd private(@simd_privatizer %x -> %arg1 : !llvm.ptr) { + omp.loop_nest (%arg2) : i32 = (%5) to (%5) inclusive step (%5) { + omp.yield + } + } + omp.terminator + } + llvm.return + } + +} + +// CHECK-LABEL: define {{.*}} @__omp_offloading_{{.*}}_test_target_simd_{{.*}} + +// CHECK: %[[INT:.*]] = alloca i32, align 4 +// CHECK: br label %[[LATE_ALLOC_BB:.*]] + +// CHECK: [[LATE_ALLOC_BB]]: +// CHECK: br label %[[AFTER_ALLOC_BB:.*]] + +// CHECK: [[AFTER_ALLOC_BB]]: +// CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index 392a6558dcfa6..c1e30964b2507 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -271,17 +271,6 @@ llvm.func @target_host_eval(%x : i32) { // ----- -llvm.func @target_if(%x : i1) { - // expected-error@below {{not yet implemented: Unhandled clause if in omp.target operation}} - // expected-error@below {{LLVM Translation failed for operation: omp.target}} - omp.target if(%x) { - omp.terminator - } - llvm.return -} - -// ----- - omp.declare_reduction @add_f32 : f32 init { ^bb0(%arg: f32): diff --git a/mlir/unittests/IR/InterfaceAttachmentTest.cpp b/mlir/unittests/IR/InterfaceAttachmentTest.cpp index b6066dd5685dc..1b5d3b8c31bd2 100644 --- a/mlir/unittests/IR/InterfaceAttachmentTest.cpp +++ b/mlir/unittests/IR/InterfaceAttachmentTest.cpp @@ -43,7 +43,7 @@ struct Model /// overrides default methods. 
 struct OverridingModel
     : public TestExternalTypeInterface::ExternalModel<OverridingModel,
-                                                      FloatType> {
+                                                      Float32Type> {
   unsigned getBitwidthPlusArg(Type type, unsigned arg) const {
     return type.getIntOrFloatBitWidth() + arg;
   }
diff --git a/mlir/utils/vim/ftplugin/mlir.vim b/mlir/utils/vim/ftplugin/mlir.vim
index 83b07f51001c0..2e3845cdddc32 100644
--- a/mlir/utils/vim/ftplugin/mlir.vim
+++ b/mlir/utils/vim/ftplugin/mlir.vim
@@ -10,3 +10,12 @@ let b:did_ftplugin = 1
 setlocal softtabstop=2 shiftwidth=2
 setlocal expandtab
 setlocal comments+=://
+setlocal commentstring=//\ %s
+" We treat sequences of the following characters as forming 'keywords', with
+" the aim of easing movement around MLIR identifiers:
+" * identifier prefixes: '%' and '@' (@-@)
+" * all characters where isalpha() returns TRUE (@)
+" * the digits 0-9 (48-57)
+" * other characters that may form identifiers: '_', '.', '-', '$'
+" Comment this out to restore the default behaviour
+setlocal iskeyword=%,@-@,@,48-57,_,.,-,$
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 9d33e94e4432b..47e632098a41b 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -2168,6 +2168,7 @@ cc_library(
         ":sema",
         ":serialization_attr_gen",
         ":static_analyzer_core_options",
+        ":support",
         ":type_nodes_gen",
         "//llvm:BitReader",
         "//llvm:BitWriter",