diff --git a/.clang-tidy b/.clang-tidy index 06bb0f18e9d2e..2cda1b81de808 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,3 +1,4 @@ +HeaderFilterRegex: '' Checks: > -*, clang-diagnostic-*, diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index fb3f05329be21..d7ec853af862f 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -165,6 +165,7 @@ clang_target_link_libraries(clangDaemon clangBasic clangDependencyScanning clangDriver + clangOptions clangFormat clangFrontend clangIndex diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp index c1be93730129a..7990f2719e9a0 100644 --- a/clang-tools-extra/clangd/CompileCommands.cpp +++ b/clang-tools-extra/clangd/CompileCommands.cpp @@ -11,8 +11,8 @@ #include "support/Logger.h" #include "support/Trace.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInvocation.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/ADT/ArrayRef.h" @@ -206,7 +206,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, if (Cmd.empty()) return; - auto &OptTable = clang::driver::getDriverOptTable(); + auto &OptTable = getDriverOptTable(); // OriginalArgs needs to outlive ArgList. llvm::SmallVector OriginalArgs; OriginalArgs.reserve(Cmd.size()); @@ -222,8 +222,8 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, llvm::opt::InputArgList ArgList; ArgList = OptTable.ParseArgs( llvm::ArrayRef(OriginalArgs).drop_front(), IgnoredCount, IgnoredCount, - llvm::opt::Visibility(IsCLMode ? driver::options::CLOption - : driver::options::ClangOption)); + llvm::opt::Visibility(IsCLMode ? options::CLOption + : options::ClangOption)); llvm::SmallVector IndicesToDrop; // Having multiple architecture options (e.g. when building fat binaries) @@ -232,7 +232,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, // As there are no signals to figure out which one user actually wants. They // can explicitly specify one through `CompileFlags.Add` if need be. unsigned ArchOptCount = 0; - for (auto *Input : ArgList.filtered(driver::options::OPT_arch)) { + for (auto *Input : ArgList.filtered(options::OPT_arch)) { ++ArchOptCount; for (auto I = 0U; I <= Input->getNumValues(); ++I) IndicesToDrop.push_back(Input->getIndex() + I); @@ -262,13 +262,12 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, // explicitly at the end of the flags. This ensures modifications done in the // following steps apply in more cases (like setting -x, which only affects // inputs that come after it). - for (auto *Input : ArgList.filtered(driver::options::OPT_INPUT)) { + for (auto *Input : ArgList.filtered(options::OPT_INPUT)) { SawInput(Input->getValue(0)); IndicesToDrop.push_back(Input->getIndex()); } // Anything after `--` is also treated as input, drop them as well. - if (auto *DashDash = - ArgList.getLastArgNoClaim(driver::options::OPT__DASH_DASH)) { + if (auto *DashDash = ArgList.getLastArgNoClaim(options::OPT__DASH_DASH)) { auto DashDashIndex = DashDash->getIndex() + 1; // +1 accounts for Cmd[0] // Another +1 so we don't treat the `--` itself as an input. for (unsigned I = DashDashIndex + 1; I < Cmd.size(); ++I) @@ -424,11 +423,11 @@ DriverMode getDriverMode(const std::vector &Args) { // Returns the set of DriverModes where an option may be used. unsigned char getModes(const llvm::opt::Option &Opt) { unsigned char Result = DM_None; - if (Opt.hasVisibilityFlag(driver::options::ClangOption)) + if (Opt.hasVisibilityFlag(options::ClangOption)) Result |= DM_GCC; - if (Opt.hasVisibilityFlag(driver::options::CC1Option)) + if (Opt.hasVisibilityFlag(options::CC1Option)) Result |= DM_CC1; - if (Opt.hasVisibilityFlag(driver::options::CLOption)) + if (Opt.hasVisibilityFlag(options::CLOption)) Result |= DM_CL; return Result; } @@ -442,8 +441,8 @@ llvm::ArrayRef ArgStripper::rulesFor(llvm::StringRef Arg) { using TableTy = llvm::StringMap, llvm::BumpPtrAllocator>; static TableTy *Table = [] { - auto &DriverTable = driver::getDriverOptTable(); - using DriverID = clang::driver::options::ID; + auto &DriverTable = getDriverOptTable(); + using DriverID = clang::options::ID; // Collect sets of aliases, so we can treat -foo and -foo= as synonyms. // Conceptually a double-linked list: PrevAlias[I] -> I -> NextAlias[I]. @@ -468,7 +467,7 @@ llvm::ArrayRef ArgStripper::rulesFor(llvm::StringRef Arg) { FLAGS, VISIBILITY, PARAM, HELPTEXT, HELPTEXTSFORVARIANTS, \ METAVAR, VALUES, SUBCOMMANDIDS_OFFSET) \ {DriverID::OPT_##ID, DriverID::OPT_##ALIAS, ALIASARGS}, -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTION }; for (auto &E : AliasTable) diff --git a/clang-tools-extra/modularize/CMakeLists.txt b/clang-tools-extra/modularize/CMakeLists.txt index eb5383c3ad44e..a775b790a3147 100644 --- a/clang-tools-extra/modularize/CMakeLists.txt +++ b/clang-tools-extra/modularize/CMakeLists.txt @@ -20,6 +20,7 @@ clang_target_link_libraries(modularize clangAST clangBasic clangDriver + clangOptions clangFrontend clangLex clangSerialization diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp index 1345a6ef8f489..d80d78c64c6e2 100644 --- a/clang-tools-extra/modularize/CoverageChecker.cpp +++ b/clang-tools-extra/modularize/CoverageChecker.cpp @@ -50,18 +50,18 @@ // //===----------------------------------------------------------------------===// +#include "CoverageChecker.h" #include "ModularizeUtilities.h" #include "clang/AST/ASTConsumer.h" -#include "CoverageChecker.h" #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Option.h" @@ -73,7 +73,7 @@ using namespace Modularize; using namespace clang; using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace clang::tooling; namespace cl = llvm::cl; namespace sys = llvm::sys; diff --git a/clang-tools-extra/modularize/Modularize.cpp b/clang-tools-extra/modularize/Modularize.cpp index 376ad0c7875bf..33966b44f719a 100644 --- a/clang-tools-extra/modularize/Modularize.cpp +++ b/clang-tools-extra/modularize/Modularize.cpp @@ -231,11 +231,11 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Arg.h" @@ -254,7 +254,7 @@ using namespace clang; using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace clang::tooling; using namespace llvm; using namespace llvm::opt; diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp index 4dd84feac5df4..6978a6b2fe1b7 100644 --- a/clang-tools-extra/modularize/ModularizeUtilities.cpp +++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp @@ -12,17 +12,17 @@ // //===----------------------------------------------------------------------===// +#include "ModularizeUtilities.h" +#include "CoverageChecker.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendActions.h" -#include "CoverageChecker.h" +#include "clang/Options/Options.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" -#include "ModularizeUtilities.h" using namespace clang; using namespace llvm; diff --git a/clang-tools-extra/pp-trace/CMakeLists.txt b/clang-tools-extra/pp-trace/CMakeLists.txt index 1323adbc35269..da36582ee0234 100644 --- a/clang-tools-extra/pp-trace/CMakeLists.txt +++ b/clang-tools-extra/pp-trace/CMakeLists.txt @@ -14,6 +14,7 @@ clang_target_link_libraries(pp-trace PRIVATE clangAST clangBasic + clangOptions clangFrontend clangLex clangSerialization diff --git a/clang-tools-extra/pp-trace/PPTrace.cpp b/clang-tools-extra/pp-trace/PPTrace.cpp index 0b078c49a55b7..ba5a06a26830d 100644 --- a/clang-tools-extra/pp-trace/PPTrace.cpp +++ b/clang-tools-extra/pp-trace/PPTrace.cpp @@ -28,11 +28,11 @@ #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/Execution.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Arg.h" diff --git a/clang/docs/CMakeLists.txt b/clang/docs/CMakeLists.txt index 1f06c040c96cb..9469a832adb62 100644 --- a/clang/docs/CMakeLists.txt +++ b/clang/docs/CMakeLists.txt @@ -132,7 +132,7 @@ if (LLVM_ENABLE_SPHINX) # Generated files gen_rst_file_from_td(AttributeReference.rst -gen-attr-docs ../include/clang/Basic/Attr.td "${docs_targets}") gen_rst_file_from_td(DiagnosticsReference.rst -gen-diag-docs ../include/clang/Basic/Diagnostic.td "${docs_targets}") - gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Driver/ClangOptionDocs.td "${docs_targets}") + gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Options/ClangOptionDocs.td "${docs_targets}") # Another generated file from a different source set(docs_tools_dir ${CMAKE_CURRENT_SOURCE_DIR}/tools) diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index eff46ab46e1ca..a849d05eb7ae9 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -667,7 +667,7 @@ Command Line Interface ---------------------- The command line interface of the Clang ``-cc1`` frontend is defined alongside -the driver options in ``clang/Driver/Options.td``. The information making up an +the driver options in ``clang/Options/Options.td``. The information making up an option definition includes its prefix and name (for example ``-std=``), form and position of the option value, help text, aliases and more. Each option may belong to a certain group and can be marked with zero or more flags. Options @@ -712,7 +712,7 @@ variable for the option value: } Next, declare the command line interface of the option in the tablegen file -``clang/include/clang/Driver/Options.td``. This is done by instantiating the +``clang/include/clang/Options/Options.td``. This is done by instantiating the ``Option`` class (defined in ``llvm/include/llvm/Option/OptParser.td``). The instance is typically created through one of the helper classes that encode the acceptable ways to specify the option value on the command line: @@ -906,7 +906,7 @@ command line: SHOULD_PARSE, KEYPATH, DEFAULT_VALUE, \ IMPLIED_CHECK, IMPLIED_VALUE, NORMALIZER, \ MERGER, TABLE_INDEX) - #include "clang/Driver/Options.inc" + #include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // ... @@ -925,7 +925,7 @@ command line: GENERATE_OPTION_WITH_MARSHALLING( \ Args, SA, KIND, FLAGS, SPELLING, ALWAYS_EMIT, KEYPATH, DEFAULT_VALUE, \ IMPLIED_CHECK, IMPLIED_VALUE, DENORMALIZER, EXTRACTOR, TABLE_INDEX) - #include "clang/Driver/Options.inc" + #include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // ... diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5f356daec2d04..9d3ac48d2e0ca 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -79,6 +79,9 @@ Potentially Breaking Changes void foo(void) { return ({ 1;; }); } +- Downstream projects that previously linked only against ``clangDriver`` may + now (also) need to link against the new ``clangOptions`` library, since + options-related code has been moved out of the Driver into a separate library. C/C++ Language Potentially Breaking Changes ------------------------------------------- diff --git a/clang/include/clang/CMakeLists.txt b/clang/include/clang/CMakeLists.txt index 47ac70cd21690..77a44e4c48de5 100644 --- a/clang/include/clang/CMakeLists.txt +++ b/clang/include/clang/CMakeLists.txt @@ -3,7 +3,7 @@ add_subdirectory(Basic) if(CLANG_ENABLE_CIR) add_subdirectory(CIR) endif() -add_subdirectory(Driver) +add_subdirectory(Options) add_subdirectory(Parse) add_subdirectory(Sema) add_subdirectory(Serialization) diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 419089731569b..297afe26812a0 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -15,11 +15,11 @@ #include "clang/Driver/Action.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Phases.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Types.h" #include "clang/Driver/Util.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/StringMap.h" diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h index 49fd920d1ec43..ed2703c76f18d 100644 --- a/clang/include/clang/Frontend/Utils.h +++ b/clang/include/clang/Frontend/Utils.h @@ -15,8 +15,8 @@ #include "clang/Basic/Diagnostic.h" #include "clang/Basic/LLVM.h" -#include "clang/Driver/OptionUtils.h" #include "clang/Frontend/DependencyOutputOptions.h" +#include "clang/Options/OptionUtils.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/StringMap.h" diff --git a/clang/include/clang/Driver/CMakeLists.txt b/clang/include/clang/Options/CMakeLists.txt similarity index 100% rename from clang/include/clang/Driver/CMakeLists.txt rename to clang/include/clang/Options/CMakeLists.txt diff --git a/clang/include/clang/Driver/ClangOptionDocs.td b/clang/include/clang/Options/ClangOptionDocs.td similarity index 100% rename from clang/include/clang/Driver/ClangOptionDocs.td rename to clang/include/clang/Options/ClangOptionDocs.td diff --git a/clang/include/clang/Driver/OptionUtils.h b/clang/include/clang/Options/OptionUtils.h similarity index 94% rename from clang/include/clang/Driver/OptionUtils.h rename to clang/include/clang/Options/OptionUtils.h index 922f536bf33ea..83c48bd7d6843 100644 --- a/clang/include/clang/Driver/OptionUtils.h +++ b/clang/include/clang/Options/OptionUtils.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_DRIVER_OPTIONUTILS_H -#define LLVM_CLANG_DRIVER_OPTIONUTILS_H +#ifndef LLVM_CLANG_OPTIONS_OPTIONUTILS_H +#define LLVM_CLANG_OPTIONS_OPTIONUTILS_H #include "clang/Basic/Diagnostic.h" #include "clang/Basic/LLVM.h" @@ -55,4 +55,4 @@ inline uint64_t getLastArgUInt64Value(const llvm::opt::ArgList &Args, } // namespace clang -#endif // LLVM_CLANG_DRIVER_OPTIONUTILS_H +#endif // LLVM_CLANG_OPTIONS_OPTIONUTILS_H diff --git a/clang/include/clang/Driver/Options.h b/clang/include/clang/Options/Options.h similarity index 83% rename from clang/include/clang/Driver/Options.h rename to clang/include/clang/Options/Options.h index 0797410e9940e..ac98699001965 100644 --- a/clang/include/clang/Driver/Options.h +++ b/clang/include/clang/Options/Options.h @@ -6,14 +6,13 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_DRIVER_OPTIONS_H -#define LLVM_CLANG_DRIVER_OPTIONS_H +#ifndef LLVM_CLANG_OPTIONS_OPTIONS_H +#define LLVM_CLANG_OPTIONS_OPTIONS_H #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" namespace clang { -namespace driver { namespace options { /// Flags specifically for clang options. Must not overlap with @@ -42,16 +41,15 @@ enum ClangVisibility { }; enum ID { - OPT_INVALID = 0, // This is not an option ID. + OPT_INVALID = 0, // This is not an option ID. #define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__), -#include "clang/Driver/Options.inc" - LastOption +#include "clang/Options/Options.inc" + LastOption #undef OPTION - }; -} +}; +} // namespace options const llvm::opt::OptTable &getDriverOptTable(); -} -} +} // namespace clang -#endif +#endif // LLVM_CLANG_OPTIONS_OPTIONS_H diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Options/Options.td similarity index 100% rename from clang/include/clang/Driver/Options.td rename to clang/include/clang/Options/Options.td diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap index c5535262ae38c..a11c8683c601e 100644 --- a/clang/include/module.modulemap +++ b/clang/include/module.modulemap @@ -146,6 +146,7 @@ module Clang_Lex { module * { export * } } +module Clang_Options { requires cplusplus umbrella "clang/Options" module * { export * } } module Clang_Parse { requires cplusplus umbrella "clang/Parse" module * { export * } } module Clang_Rewrite { requires cplusplus umbrella "clang/Rewrite/Core" module * { export * } } module Clang_RewriteFrontend { requires cplusplus umbrella "clang/Rewrite/Frontend" module * { export * } } diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp index 2dec26ecacf26..5e9da245e2b43 100644 --- a/clang/lib/Basic/Diagnostic.cpp +++ b/clang/lib/Basic/Diagnostic.cpp @@ -534,7 +534,7 @@ WarningsSpecialCaseList::create(const llvm::MemoryBuffer &Input, void WarningsSpecialCaseList::processSections(DiagnosticsEngine &Diags) { static constexpr auto WarningFlavor = clang::diag::Flavor::WarningOrError; for (const auto &SectionEntry : sections()) { - StringRef DiagGroup = SectionEntry.SectionStr; + StringRef DiagGroup = SectionEntry.name(); if (DiagGroup == "*") { // Drop the default section introduced by special case list, we only // support exact diagnostic group names. diff --git a/clang/lib/Basic/ProfileList.cpp b/clang/lib/Basic/ProfileList.cpp index 9cb118893a0d9..8727057eb78d1 100644 --- a/clang/lib/Basic/ProfileList.cpp +++ b/clang/lib/Basic/ProfileList.cpp @@ -36,7 +36,7 @@ class ProfileSpecialCaseList : public llvm::SpecialCaseList { bool hasPrefix(StringRef Prefix) const { for (const auto &It : sections()) - if (It.Entries.count(Prefix) > 0) + if (It.hasPrefix(Prefix)) return true; return false; } diff --git a/clang/lib/Basic/SanitizerSpecialCaseList.cpp b/clang/lib/Basic/SanitizerSpecialCaseList.cpp index 56f551628cf89..928c086898097 100644 --- a/clang/lib/Basic/SanitizerSpecialCaseList.cpp +++ b/clang/lib/Basic/SanitizerSpecialCaseList.cpp @@ -42,7 +42,7 @@ void SanitizerSpecialCaseList::createSanitizerSections() { SanitizerMask Mask; #define SANITIZER(NAME, ID) \ - if (S.SectionMatcher.matchAny(NAME)) \ + if (S.matchName(NAME)) \ Mask |= SanitizerKind::ID; #define SANITIZER_GROUP(NAME, ID, ALIAS) SANITIZER(NAME, ID) @@ -68,7 +68,7 @@ SanitizerSpecialCaseList::inSectionBlame(SanitizerMask Mask, StringRef Prefix, if (S.Mask & Mask) { unsigned LineNum = S.S.getLastMatch(Prefix, Query, Category); if (LineNum > 0) - return {S.S.FileIdx, LineNum}; + return {S.S.fileIndex(), LineNum}; } } return NotFound; diff --git a/clang/lib/CMakeLists.txt b/clang/lib/CMakeLists.txt index 4f2218b583e41..e90b009da606a 100644 --- a/clang/lib/CMakeLists.txt +++ b/clang/lib/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(Edit) add_subdirectory(ExtractAPI) add_subdirectory(Rewrite) add_subdirectory(Driver) +add_subdirectory(Options) add_subdirectory(Serialization) add_subdirectory(Frontend) add_subdirectory(FrontendTool) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 68a83561d7cb1..b5d1afdb11805 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -2024,22 +2024,29 @@ void CGOpenMPRuntime::emitCriticalRegion(CodeGenFunction &CGF, // Prepare arguments and build a call to __kmpc_critical if (!CGF.HaveInsertPoint()) return; + llvm::FunctionCallee RuntimeFcn = OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), + Hint ? OMPRTL___kmpc_critical_with_hint : OMPRTL___kmpc_critical); + llvm::Value *LockVar = getCriticalRegionLock(CriticalName); + unsigned LockVarArgIdx = 2; + if (cast(LockVar)->getAddressSpace() != + RuntimeFcn.getFunctionType() + ->getParamType(LockVarArgIdx) + ->getPointerAddressSpace()) + LockVar = CGF.Builder.CreateAddrSpaceCast( + LockVar, RuntimeFcn.getFunctionType()->getParamType(LockVarArgIdx)); llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), - getCriticalRegionLock(CriticalName)}; + LockVar}; llvm::SmallVector EnterArgs(std::begin(Args), std::end(Args)); if (Hint) { EnterArgs.push_back(CGF.Builder.CreateIntCast( CGF.EmitScalarExpr(Hint), CGM.Int32Ty, /*isSigned=*/false)); } - CommonActionTy Action( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), - Hint ? OMPRTL___kmpc_critical_with_hint : OMPRTL___kmpc_critical), - EnterArgs, - OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___kmpc_end_critical), - Args); + CommonActionTy Action(RuntimeFcn, EnterArgs, + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_end_critical), + Args); CriticalOpGen.setAction(Action); emitInlinedDirective(CGF, OMPD_critical, CriticalOpGen); } diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt index a8d83d38264a7..8577b3af257a7 100644 --- a/clang/lib/Driver/CMakeLists.txt +++ b/clang/lib/Driver/CMakeLists.txt @@ -20,12 +20,10 @@ add_clang_library(clangDriver Compilation.cpp Distro.cpp Driver.cpp - DriverOptions.cpp Job.cpp Multilib.cpp MultilibBuilder.cpp OffloadBundler.cpp - OptionUtils.cpp Phases.cpp SanitizerArgs.cpp Tool.cpp @@ -101,6 +99,7 @@ add_clang_library(clangDriver LINK_LIBS clangBasic clangLex + clangOptions ${system_libs} ${LLVM_PTHREAD_LIB} ) diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp index 665d81f99ba45..4a75ae2c74c51 100644 --- a/clang/lib/Driver/Compilation.cpp +++ b/clang/lib/Driver/Compilation.cpp @@ -11,9 +11,9 @@ #include "clang/Driver/Action.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Util.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/OptSpecifier.h" #include "llvm/Option/Option.h" diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 89c4b7363f21c..12645ad58413f 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -60,8 +60,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" -#include "clang/Driver/OptionUtils.h" +#include "clang/Options/Options.h" #include "clang/Driver/Phases.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/Tool.h" @@ -69,6 +68,7 @@ #include "clang/Driver/ToolChain.h" #include "clang/Driver/Util.h" #include "clang/Lex/DependencyDirectivesScanner.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -113,6 +113,7 @@ using namespace clang::driver; using namespace clang; using namespace llvm::opt; +using namespace clang::options; template static bool usesInput(const ArgList &Args, F &&Fn) { return llvm::any_of(Args, [&](Arg *A) { @@ -1704,10 +1705,11 @@ Compilation *Driver::BuildCompilation(ArrayRef ArgList) { // Force -parallel-jobs=1 when verbose is set to avoid corrupted output if (Args.hasArg(options::OPT_v)) setNumberOfParallelJobs(1); +#if FIXME else setNumberOfParallelJobs( - getLastArgIntValue(Args, options::OPT_parallel_jobs_EQ, 1, Diags)); - + getLastArgIntValue(Args, clang::options::OPT_parallel_jobs_EQ, 1, Diags)); +#endif // Remove existing compilation database so that each job can append to it. if (Arg *A = Args.getLastArg(options::OPT_MJ)) llvm::sys::fs::remove(A->getValue()); diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 5dd48f53b9069..420c4cddbc8dd 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -8,8 +8,8 @@ #include "clang/Driver/SanitizerArgs.h" #include "clang/Basic/Sanitizers.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" +#include "clang/Options/Options.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 3452ce5339174..5ee53c01d6e30 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -21,9 +21,9 @@ #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/XRayArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" @@ -338,7 +338,7 @@ static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple, Multilib::flags_list ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const { - using namespace clang::driver::options; + using namespace clang::options; std::vector Result; const llvm::Triple Triple(ComputeEffectiveClangTriple(Args)); @@ -1816,7 +1816,7 @@ void ToolChain::TranslateXarchArgs( unsigned Index = BaseArgs.MakeIndex(A->getValue(ValuePos)); unsigned Prev = Index; std::unique_ptr XarchArg(Opts.ParseOneArg( - Args, Index, llvm::opt::Visibility(clang::driver::options::ClangOption))); + Args, Index, llvm::opt::Visibility(options::ClangOption))); // If the argument parsing failed or more than one argument was // consumed, the -Xarch_ argument's parameter tried to consume diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index ffd7b69205440..21e7dcad3cdd0 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -9,8 +9,8 @@ #include "AIX.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" @@ -19,6 +19,7 @@ #include using AIX = clang::driver::toolchains::AIX; +using namespace clang; using namespace clang::driver; using namespace clang::driver::tools; using namespace clang::driver::toolchains; @@ -167,8 +168,7 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, Args.hasArg(options::OPT_coverage)) CmdArgs.push_back("-bdbg:namedsects:ss"); - if (Arg *A = - Args.getLastArg(clang::driver::options::OPT_mxcoff_build_id_EQ)) { + if (Arg *A = Args.getLastArg(options::OPT_mxcoff_build_id_EQ)) { StringRef BuildId = A->getValue(); if (BuildId[0] != '0' || BuildId[1] != 'x' || BuildId.find_if_not(llvm::isHexDigit, 2) != StringRef::npos) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index edfc62f0ad7d5..3b5411179349b 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Error.h" @@ -324,27 +324,24 @@ RocmInstallationDetector::RocmInstallationDetector( const llvm::opt::ArgList &Args, bool DetectHIPRuntime, bool DetectDeviceLib) : D(D) { Verbose = Args.hasArg(options::OPT_v); - RocmPathArg = Args.getLastArgValue(clang::driver::options::OPT_rocm_path_EQ); - PrintROCmSearchDirs = - Args.hasArg(clang::driver::options::OPT_print_rocm_search_dirs); + RocmPathArg = Args.getLastArgValue(options::OPT_rocm_path_EQ); + PrintROCmSearchDirs = Args.hasArg(options::OPT_print_rocm_search_dirs); RocmDeviceLibPathArg = - Args.getAllArgValues(clang::driver::options::OPT_rocm_device_lib_path_EQ); - HIPPathArg = Args.getLastArgValue(clang::driver::options::OPT_hip_path_EQ); - HIPStdParPathArg = - Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_path_EQ); + Args.getAllArgValues(options::OPT_rocm_device_lib_path_EQ); + HIPPathArg = Args.getLastArgValue(options::OPT_hip_path_EQ); + HIPStdParPathArg = Args.getLastArgValue(options::OPT_hipstdpar_path_EQ); HasHIPStdParLibrary = !HIPStdParPathArg.empty() && D.getVFS().exists(HIPStdParPathArg + "/hipstdpar_lib.hpp"); HIPRocThrustPathArg = - Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_thrust_path_EQ); + Args.getLastArgValue(options::OPT_hipstdpar_thrust_path_EQ); HasRocThrustLibrary = !HIPRocThrustPathArg.empty() && D.getVFS().exists(HIPRocThrustPathArg + "/thrust"); - HIPRocPrimPathArg = - Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_prim_path_EQ); + HIPRocPrimPathArg = Args.getLastArgValue(options::OPT_hipstdpar_prim_path_EQ); HasRocPrimLibrary = !HIPRocPrimPathArg.empty() && D.getVFS().exists(HIPRocPrimPathArg + "/rocprim"); - if (auto *A = Args.getLastArg(clang::driver::options::OPT_hip_version_EQ)) { + if (auto *A = Args.getLastArg(options::OPT_hip_version_EQ)) { HIPVersionArg = A->getValue(); unsigned Major = ~0U; unsigned Minor = ~0U; diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index 7185b24aec0f8..c8601d1dedbcb 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -11,9 +11,9 @@ #include "Gnu.h" #include "clang/Basic/TargetID.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" +#include "clang/Options/Options.h" #include "llvm/ADT/SmallString.h" #include "llvm/TargetParser/TargetParser.h" diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index e73b4f02cac39..a96d40166fdb7 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -11,9 +11,9 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/Tool.h" +#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp index 731076d9754a9..588255dc5a0cd 100644 --- a/clang/lib/Driver/ToolChains/AVR.cpp +++ b/clang/lib/Driver/ToolChains/AVR.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp index e8d5e38a9064f..d6fb2a57539ed 100644 --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -9,7 +9,7 @@ #include "AArch64.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/TargetParser/Host.h" @@ -222,7 +222,7 @@ void aarch64::getAArch64TargetFeatures(const Driver &D, // Default to 'A' profile if the architecture is not specified. success = getAArch64ArchFeaturesFromMarch(D, "armv8-a", Args, Extensions); - if (success && (A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ))) + if (success && (A = Args.getLastArg(options::OPT_mtune_EQ))) success = getAArch64MicroArchFeaturesFromMtune(D, A->getValue(), Args, Features); else if (success && (A = Args.getLastArg(options::OPT_mcpu_EQ))) diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp index 61beb0455147d..55eb2dcf7ddf4 100644 --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp @@ -8,7 +8,7 @@ #include "ARM.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/ARMTargetParser.h" @@ -74,7 +74,7 @@ bool arm::isARMEABIBareMetal(const llvm::Triple &Triple) { // Get Arch/CPU from args. void arm::getARMArchCPUFromArgs(const ArgList &Args, llvm::StringRef &Arch, llvm::StringRef &CPU, bool FromAs) { - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) + if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) CPU = A->getValue(); if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) Arch = A->getValue(); diff --git a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp index 2fd2c72147f5b..65f6534e4d038 100644 --- a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp +++ b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp @@ -8,7 +8,7 @@ #include "CSKY.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/CSKYTargetParser.h" @@ -33,7 +33,7 @@ csky::getCSKYArchName(const Driver &D, const ArgList &Args, return std::optional(A->getValue()); } - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { llvm::CSKY::ArchKind ArchKind = llvm::CSKY::parseCPUArch(A->getValue()); if (ArchKind == llvm::CSKY::ArchKind::INVALID) { D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args); @@ -126,7 +126,7 @@ void csky::getCSKYTargetFeatures(const Driver &D, const llvm::Triple &Triple, archName = A->getValue(); } - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { llvm::CSKY::ArchKind Kind = llvm::CSKY::parseCPUArch(A->getValue()); if (Kind == llvm::CSKY::ArchKind::INVALID) { D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args); diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index 156ea03045569..da084bdabaee3 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -11,7 +11,7 @@ #include "clang/Basic/DiagnosticDriver.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/TargetParser/Host.h" #include "llvm/TargetParser/LoongArchTargetParser.h" @@ -130,8 +130,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, const ArgList &Args, std::vector &Features) { // Enable the `lsx` feature on 64-bit LoongArch by default. - if (Triple.isLoongArch64() && - (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ))) + if (Triple.isLoongArch64() && (!Args.hasArgNoClaim(options::OPT_march_EQ))) Features.push_back("+lsx"); // -mrelax is default, unless -mno-relax is specified. diff --git a/clang/lib/Driver/ToolChains/Arch/M68k.cpp b/clang/lib/Driver/ToolChains/Arch/M68k.cpp index 708ec84a37cfb..a620597f10475 100644 --- a/clang/lib/Driver/ToolChains/Arch/M68k.cpp +++ b/clang/lib/Driver/ToolChains/Arch/M68k.cpp @@ -8,7 +8,7 @@ #include "M68k.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Regex.h" @@ -21,7 +21,7 @@ using namespace llvm::opt; /// getM68kTargetCPU - Get the (LLVM) name of the 68000 cpu we are targeting. std::string m68k::getM68kTargetCPU(const ArgList &Args) { - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { + if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { // The canonical CPU name is captalize. However, we allow // starting with lower case or numbers only StringRef CPUName = A->getValue(); @@ -45,17 +45,17 @@ std::string m68k::getM68kTargetCPU(const ArgList &Args) { .Default(CPUName.str()); } // FIXME: Throw error when multiple sub-architecture flag exist - if (Args.hasArg(clang::driver::options::OPT_m68000)) + if (Args.hasArg(options::OPT_m68000)) return "M68000"; - if (Args.hasArg(clang::driver::options::OPT_m68010)) + if (Args.hasArg(options::OPT_m68010)) return "M68010"; - if (Args.hasArg(clang::driver::options::OPT_m68020)) + if (Args.hasArg(options::OPT_m68020)) return "M68020"; - if (Args.hasArg(clang::driver::options::OPT_m68030)) + if (Args.hasArg(options::OPT_m68030)) return "M68030"; - if (Args.hasArg(clang::driver::options::OPT_m68040)) + if (Args.hasArg(options::OPT_m68040)) return "M68040"; - if (Args.hasArg(clang::driver::options::OPT_m68060)) + if (Args.hasArg(options::OPT_m68060)) return "M68060"; return ""; diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp index 8d7b85dbeed99..103aae7018fbf 100644 --- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp @@ -9,7 +9,7 @@ #include "Mips.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" @@ -49,8 +49,7 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple, DefMips64CPU = "mips3"; } - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ, - options::OPT_mcpu_EQ)) + if (Arg *A = Args.getLastArg(options::OPT_march_EQ, options::OPT_mcpu_EQ)) CPUName = A->getValue(); if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) { diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp index 361a68a892a8f..44afdd249fea5 100644 --- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp +++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp @@ -9,7 +9,7 @@ #include "PPC.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/Host.h" diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp index f2e79e71f93d4..1dcce6d053a39 100644 --- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp +++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp @@ -10,7 +10,7 @@ #include "../Clang.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Error.h" #include "llvm/TargetParser/Host.h" diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp index 94a94f1e9c487..49256d80cbdf6 100644 --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp @@ -8,7 +8,7 @@ #include "Sparc.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/Host.h" @@ -122,7 +122,7 @@ sparc::FloatABI sparc::getSparcFloatABI(const Driver &D, std::string sparc::getSparcTargetCPU(const Driver &D, const ArgList &Args, const llvm::Triple &Triple) { - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { StringRef CPUName = A->getValue(); if (CPUName == "native") { std::string CPU = std::string(llvm::sys::getHostCPUName()); diff --git a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp index 75b6afd925245..1ef6a725483e8 100644 --- a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp +++ b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp @@ -8,7 +8,7 @@ #include "SystemZ.h" #include "clang/Config/config.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/Host.h" @@ -25,9 +25,9 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D, D.Diag(diag::err_drv_unsupported_opt) << Args.getLastArg(options::OPT_mfloat_abi_EQ)->getAsString(Args); - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_msoft_float, - options::OPT_mhard_float)) - if (A->getOption().matches(clang::driver::options::OPT_msoft_float)) + if (Arg *A = + Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float)) + if (A->getOption().matches(options::OPT_msoft_float)) ABI = systemz::FloatABI::Soft; return ABI; @@ -35,7 +35,7 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D, std::string systemz::getSystemZTargetCPU(const ArgList &Args, const llvm::Triple &T) { - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { llvm::StringRef CPUName = A->getValue(); if (CPUName == "native") { diff --git a/clang/lib/Driver/ToolChains/Arch/VE.cpp b/clang/lib/Driver/ToolChains/Arch/VE.cpp index adc0873586588..c8353d7dc5f3a 100644 --- a/clang/lib/Driver/ToolChains/Arch/VE.cpp +++ b/clang/lib/Driver/ToolChains/Arch/VE.cpp @@ -8,7 +8,7 @@ #include "VE.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" using namespace clang::driver; diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 1373905a5120e..092069b6ade56 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -8,7 +8,7 @@ #include "X86.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/Option/ArgList.h" @@ -21,7 +21,7 @@ using namespace llvm::opt; std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args, const llvm::Triple &Triple) { - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { StringRef CPU = A->getValue(); if (CPU != "native") return std::string(CPU); @@ -119,7 +119,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, std::vector &Features) { // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or // "ms_abi" as default function attributes. - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) { StringRef DefaultAbi = (Triple.isOSWindows() || Triple.isUEFI()) ? "ms" : "sysv"; if (A->getValue() != DefaultAbi) @@ -128,7 +128,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, } // If -march=native, autodetect the feature list. - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { if (StringRef(A->getValue()) == "native") { for (auto &F : llvm::sys::getHostCPUFeatures()) Features.push_back( @@ -163,7 +163,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, // flags). This is a bit hacky but keeps existing usages working. We should // consider deprecating this and instead warn if the user requests external // retpoline thunks and *doesn't* request some form of retpolines. - auto SpectreOpt = clang::driver::options::ID::OPT_INVALID; + auto SpectreOpt = options::ID::OPT_INVALID; if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline, options::OPT_mspeculative_load_hardening, options::OPT_mno_speculative_load_hardening)) { @@ -189,7 +189,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, SpectreOpt = options::OPT_mretpoline_external_thunk; } - auto LVIOpt = clang::driver::options::ID::OPT_INVALID; + auto LVIOpt = options::ID::OPT_INVALID; if (Args.hasFlag(options::OPT_mlvi_hardening, options::OPT_mno_lvi_hardening, false)) { Features.push_back("+lvi-load-hardening"); @@ -207,7 +207,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, << D.getOpts().getOptionName(options::OPT_mlvi_hardening) << D.getOpts().getOptionName(options::OPT_m_seses); - if (SpectreOpt != clang::driver::options::ID::OPT_INVALID) + if (SpectreOpt != options::ID::OPT_INVALID) D.Diag(diag::err_drv_argument_not_allowed_with) << D.getOpts().getOptionName(SpectreOpt) << D.getOpts().getOptionName(options::OPT_m_seses); @@ -219,8 +219,8 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, } } - if (SpectreOpt != clang::driver::options::ID::OPT_INVALID && - LVIOpt != clang::driver::options::ID::OPT_INVALID) { + if (SpectreOpt != options::ID::OPT_INVALID && + LVIOpt != options::ID::OPT_INVALID) { D.Diag(diag::err_drv_argument_not_allowed_with) << D.getOpts().getOptionName(SpectreOpt) << D.getOpts().getOptionName(LVIOpt); diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index 9b7f58c392885..8d598be9ffb0a 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -18,7 +18,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/MultilibBuilder.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" @@ -135,7 +135,7 @@ static std::string computeClangRuntimesSysRoot(const Driver &D, bool BareMetal::initGCCInstallation(const llvm::Triple &Triple, const llvm::opt::ArgList &Args) { if (Args.getLastArg(options::OPT_gcc_toolchain) || - Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) { + Args.getLastArg(clang::options::OPT_gcc_install_dir_EQ)) { GCCInstallation.init(Triple, Args); return GCCInstallation.isValid(); } diff --git a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp index e4db3307ee3aa..c561d7d38da5b 100644 --- a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp +++ b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index c5d8dea358c0a..5c5609a3c8377 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -30,10 +30,10 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Distro.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/Types.h" #include "clang/Driver/XRayArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" @@ -66,7 +66,7 @@ using namespace clang; using namespace llvm::opt; static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) { - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC, + if (Arg *A = Args.getLastArg(options::OPT_C, options::OPT_CC, options::OPT_fminimize_whitespace, options::OPT_fno_minimize_whitespace, options::OPT_fkeep_system_includes, @@ -1700,7 +1700,7 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args, AddAAPCSVolatileBitfieldArgs(Args, CmdArgs); - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { CmdArgs.push_back("-tune-cpu"); if (strcmp(A->getValue(), "native") == 0) CmdArgs.push_back(Args.MakeArgString(llvm::sys::getHostCPUName())); @@ -2106,7 +2106,7 @@ void Clang::AddSparcTargetArgs(const ArgList &Args, CmdArgs.push_back("hard"); } - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { StringRef Name = A->getValue(); std::string TuneCPU; if (Name == "native") @@ -2212,12 +2212,11 @@ void Clang::AddX86TargetArgs(const ArgList &Args, // Default to "generic" unless -march is present or targetting the PS4/PS5. std::string TuneCPU; - if (!Args.hasArg(clang::driver::options::OPT_march_EQ) && - !getToolChain().getTriple().isPS()) + if (!Args.hasArg(options::OPT_march_EQ) && !getToolChain().getTriple().isPS()) TuneCPU = "generic"; // Override based on -mtune. - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { StringRef Name = A->getValue(); if (Name == "native") { diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 52e43d1a003eb..aa84f1ac86658 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -31,11 +31,11 @@ #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Util.h" #include "clang/Driver/XRayArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" @@ -88,8 +88,7 @@ static bool addRPathCmdArg(const llvm::opt::ArgList &Args, static bool useFramePointerForTargetByDefault(const llvm::opt::ArgList &Args, const llvm::Triple &Triple) { - if (Args.hasArg(clang::driver::options::OPT_pg) && - !Args.hasArg(clang::driver::options::OPT_mfentry)) + if (Args.hasArg(options::OPT_pg) && !Args.hasArg(options::OPT_mfentry)) return true; if (Triple.isAndroid()) @@ -268,17 +267,16 @@ getFramePointerKind(const llvm::opt::ArgList &Args, // without requiring new frame records to be created. bool DefaultFP = useFramePointerForTargetByDefault(Args, Triple); - bool EnableFP = - mustUseNonLeafFramePointerForTarget(Triple) || - Args.hasFlag(clang::driver::options::OPT_fno_omit_frame_pointer, - clang::driver::options::OPT_fomit_frame_pointer, DefaultFP); + bool EnableFP = mustUseNonLeafFramePointerForTarget(Triple) || + Args.hasFlag(options::OPT_fno_omit_frame_pointer, + options::OPT_fomit_frame_pointer, DefaultFP); bool DefaultLeafFP = useLeafFramePointerForTargetByDefault(Triple) || (EnableFP && framePointerImpliesLeafFramePointer(Args, Triple)); - bool EnableLeafFP = Args.hasFlag( - clang::driver::options::OPT_mno_omit_leaf_frame_pointer, - clang::driver::options::OPT_momit_leaf_frame_pointer, DefaultLeafFP); + bool EnableLeafFP = + Args.hasFlag(options::OPT_mno_omit_leaf_frame_pointer, + options::OPT_momit_leaf_frame_pointer, DefaultLeafFP); bool FPRegReserved = EnableFP || mustMaintainValidFrameChain(Args, Triple); @@ -777,7 +775,7 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args, case llvm::Triple::ppcle: case llvm::Triple::ppc64: case llvm::Triple::ppc64le: - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) + if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) return std::string( llvm::PPC::getNormalizedPPCTargetCPU(T, A->getValue())); return std::string(llvm::PPC::getNormalizedPPCTargetCPU(T)); @@ -1440,7 +1438,7 @@ void tools::addOpenMPRuntimeSpecificRPath(const ToolChain &TC, addRPathCmdArg(Args, CmdArgs, CandidateRPath_lib); std::string rocmPath = - Args.getLastArgValue(clang::driver::options::OPT_rocm_path_EQ).str(); + Args.getLastArgValue(options::OPT_rocm_path_EQ).str(); if (rocmPath.size() != 0) { std::string rocmPath_lib = rocmPath + "/lib"; std::string rocmPath_suf = rocmPath + "/" + LibSuffix; @@ -1869,7 +1867,7 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, if (SanArgs.needsFuzzerInterceptors()) addSanitizerRuntime(TC, Args, CmdArgs, "fuzzer_interceptors", false, true); - if (!Args.hasArg(clang::driver::options::OPT_nostdlibxx)) { + if (!Args.hasArg(options::OPT_nostdlibxx)) { bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) && !Args.hasArg(options::OPT_static); if (OnlyLibstdcxxStatic) @@ -3585,7 +3583,7 @@ void tools::handleInterchangeLoopsArgs(const ArgList &Args, // Otherwise, return an empty string and issue a diagnosic message if needed. StringRef tools::parseMPreferVectorWidthOption(clang::DiagnosticsEngine &Diags, const llvm::opt::ArgList &Args) { - Arg *A = Args.getLastArg(clang::driver::options::OPT_mprefer_vector_width_EQ); + Arg *A = Args.getLastArg(options::OPT_mprefer_vector_width_EQ); if (!A) return ""; diff --git a/clang/lib/Driver/ToolChains/CrossWindows.cpp b/clang/lib/Driver/ToolChains/CrossWindows.cpp index 51c892fc91718..6df5315e8fff8 100644 --- a/clang/lib/Driver/ToolChains/CrossWindows.cpp +++ b/clang/lib/Driver/ToolChains/CrossWindows.cpp @@ -10,8 +10,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 8fc1bded2acea..de3cf7abbdec5 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -15,7 +15,7 @@ #include "clang/Driver/Distro.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE #include "llvm/Option/ArgList.h" @@ -154,16 +154,16 @@ CudaInstallationDetector::CudaInstallationDetector( std::initializer_list Versions = {"8.0", "7.5", "7.0"}; auto &FS = D.getVFS(); - if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) { + if (Args.hasArg(options::OPT_cuda_path_EQ)) { Candidates.emplace_back( - Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str()); + Args.getLastArgValue(options::OPT_cuda_path_EQ).str()); } else if (HostTriple.isOSWindows()) { for (const char *Ver : Versions) Candidates.emplace_back( D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" + Ver); } else { - if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) { + if (!Args.hasArg(options::OPT_cuda_path_ignore_env)) { // Try to find ptxas binary. If the executable is located in a directory // called 'bin/', its parent directory might be a good guess for a valid // CUDA installation. diff --git a/clang/lib/Driver/ToolChains/Cygwin.cpp b/clang/lib/Driver/ToolChains/Cygwin.cpp index d9c16347daa34..55438125ce0f1 100644 --- a/clang/lib/Driver/ToolChains/Cygwin.cpp +++ b/clang/lib/Driver/ToolChains/Cygwin.cpp @@ -10,7 +10,7 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Support/Path.h" #include "llvm/Support/VirtualFileSystem.h" @@ -58,7 +58,7 @@ void Cygwin::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 2fb7652d64536..fc3cd9030f71d 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -14,8 +14,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" @@ -1079,7 +1079,7 @@ StringRef MachO::getMachOArchName(const ArgList &Args) const { case llvm::Triple::thumb: case llvm::Triple::arm: - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) + if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) if (const char *Arch = ArmMachOArchName(A->getValue())) return Arch; @@ -2993,7 +2993,7 @@ DerivedArgList *MachO::TranslateArgs(const DerivedArgList &Args, if (!BoundArch.empty()) { StringRef Name = BoundArch; const Option MCpu = Opts.getOption(options::OPT_mcpu_EQ); - const Option MArch = Opts.getOption(clang::driver::options::OPT_march_EQ); + const Option MArch = Opts.getOption(options::OPT_march_EQ); // This code must be kept in sync with LLVM's getArchTypeForDarwinArch, // which defines the list of which architectures we accept. diff --git a/clang/lib/Driver/ToolChains/DragonFly.cpp b/clang/lib/Driver/ToolChains/DragonFly.cpp index 524f5f2ff391e..d4a6d6ae3e349 100644 --- a/clang/lib/Driver/ToolChains/DragonFly.cpp +++ b/clang/lib/Driver/ToolChains/DragonFly.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" @@ -219,7 +219,7 @@ void DragonFly::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index f24dd3982eab7..8d53547f8ab66 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -11,7 +11,7 @@ #include "clang/Basic/CodeGenOptions.h" #include "clang/Driver/CommonArgs.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Frontend/Debug/Options.h" #include "llvm/Support/Path.h" #include "llvm/TargetParser/Host.h" @@ -238,7 +238,7 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_ftime_report, options::OPT_ftime_report_EQ, options::OPT_funroll_loops, options::OPT_fno_unroll_loops, options::OPT_fdefer_desc_map, options::OPT_fno_defer_desc_map}); - if (Args.hasArg(clang::driver::options::OPT_fcoarray)) + if (Args.hasArg(options::OPT_fcoarray)) CmdArgs.push_back("-fcoarray"); } diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp index b17b76233ad30..70e66a2f5c3e7 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -13,8 +13,8 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" @@ -404,7 +404,7 @@ void FreeBSD::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 507cc03b27513..9edfc4de3d602 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/MultilibBuilder.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -344,7 +344,7 @@ Tool *Fuchsia::buildStaticLibTool() const { ToolChain::RuntimeLibType Fuchsia::GetRuntimeLibType(const ArgList &Args) const { - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) { + if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) { StringRef Value = A->getValue(); if (Value != "compiler-rt") getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index b7cd5f6be14a6..d56cb02e14ca9 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -20,9 +20,9 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/MultilibBuilder.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" #include "llvm/Option/ArgList.h" @@ -2104,7 +2104,7 @@ Generic_GCC::GCCVersion Generic_GCC::GCCVersion::Parse(StringRef VersionText) { static llvm::StringRef getGCCToolchainDir(const ArgList &Args, llvm::StringRef SysRoot) { - const Arg *A = Args.getLastArg(clang::driver::options::OPT_gcc_toolchain); + const Arg *A = Args.getLastArg(options::OPT_gcc_toolchain); if (A) return A->getValue(); @@ -2157,8 +2157,7 @@ void Generic_GCC::GCCInstallationDetector::init( CandidateBiarchTripleAliases); // If --gcc-install-dir= is specified, skip filesystem detection. - if (const Arg *A = - Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ); + if (const Arg *A = Args.getLastArg(options::OPT_gcc_install_dir_EQ); A && A->getValue()[0]) { StringRef InstallDir = A->getValue(); if (!ScanGCCForMultilibs(TargetTriple, Args, InstallDir, false)) { @@ -2181,8 +2180,7 @@ void Generic_GCC::GCCInstallationDetector::init( // If --gcc-triple is specified use this instead of trying to // auto-detect a triple. - if (const Arg *A = - Args.getLastArg(clang::driver::options::OPT_gcc_triple_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_gcc_triple_EQ)) { StringRef GCCTriple = A->getValue(); CandidateTripleAliases.clear(); CandidateTripleAliases.push_back(GCCTriple); diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index d0d2d2e34b602..b1c30beae1d35 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -15,8 +15,8 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/TargetParser/TargetParser.h" diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp index fb738577c4c44..44265e8a8b95f 100644 --- a/clang/lib/Driver/ToolChains/HIPSPV.cpp +++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp @@ -12,7 +12,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -213,7 +213,7 @@ HIPSPVToolChain::getDeviceLibs( // Find device libraries in --hip-device-lib-path and HIP_DEVICE_LIB_PATH. auto HipDeviceLibPathArgs = DriverArgs.getAllArgValues( // --hip-device-lib-path is alias to this option. - clang::driver::options::OPT_rocm_device_lib_path_EQ); + options::OPT_rocm_device_lib_path_EQ); for (auto Path : HipDeviceLibPathArgs) LibraryPaths.push_back(DriverArgs.MakeArgString(Path)); diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp index 732403e69a075..1af2ae6470f1e 100644 --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp @@ -9,7 +9,7 @@ #include "HIPUtility.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/Archive.h" diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp index 9f8b676fc7dc2..084f51721315c 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.cpp +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp @@ -11,7 +11,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Hurd.cpp b/clang/lib/Driver/ToolChains/Hurd.cpp index 43121233ea7d0..53ee4d4c0cbde 100644 --- a/clang/lib/Driver/ToolChains/Hurd.cpp +++ b/clang/lib/Driver/ToolChains/Hurd.cpp @@ -10,7 +10,7 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Support/Path.h" #include "llvm/Support/VirtualFileSystem.h" @@ -168,7 +168,7 @@ void Hurd::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index f452109134171..3b9f6377fd8e1 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -16,8 +16,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Distro.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Path.h" @@ -731,10 +731,10 @@ void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; - if (DriverArgs.hasArg(clang::driver::options::OPT_fopenmp)) { + if (DriverArgs.hasArg(options::OPT_fopenmp)) { // Look for system files in our compiler AOMP/include dir first addSystemInclude(DriverArgs, CC1Args, DriverArgs.MakeArgString(D.Dir + "/../include")); diff --git a/clang/lib/Driver/ToolChains/MSP430.cpp b/clang/lib/Driver/ToolChains/MSP430.cpp index 9eca1ad5f2865..3cc56bb7e832e 100644 --- a/clang/lib/Driver/ToolChains/MSP430.cpp +++ b/clang/lib/Driver/ToolChains/MSP430.cpp @@ -12,7 +12,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Multilib.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 0d4bbabb9bb8a..e9972cb24c245 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/ConvertUTF.h" diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp index da4a9072317f4..1bbabdfc631b8 100644 --- a/clang/lib/Driver/ToolChains/Managarm.cpp +++ b/clang/lib/Driver/ToolChains/Managarm.cpp @@ -11,8 +11,8 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" @@ -136,7 +136,7 @@ void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp index bd0e40ae3d7ad..0e8eeeb0523ce 100644 --- a/clang/lib/Driver/ToolChains/MinGW.cpp +++ b/clang/lib/Driver/ToolChains/MinGW.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" diff --git a/clang/lib/Driver/ToolChains/MipsLinux.cpp b/clang/lib/Driver/ToolChains/MipsLinux.cpp index 7dd3936613296..58d6b5031f536 100644 --- a/clang/lib/Driver/ToolChains/MipsLinux.cpp +++ b/clang/lib/Driver/ToolChains/MipsLinux.cpp @@ -9,7 +9,7 @@ #include "MipsLinux.h" #include "Arch/Mips.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -38,7 +38,7 @@ MipsLLVMToolChain::MipsLLVMToolChain(const Driver &D, void MipsLLVMToolChain::AddClangSystemIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; const Driver &D = getDriver(); diff --git a/clang/lib/Driver/ToolChains/NetBSD.cpp b/clang/lib/Driver/ToolChains/NetBSD.cpp index 8db00deeb80df..ea722b59853d6 100644 --- a/clang/lib/Driver/ToolChains/NetBSD.cpp +++ b/clang/lib/Driver/ToolChains/NetBSD.cpp @@ -14,8 +14,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" @@ -466,7 +466,7 @@ void NetBSD::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp index 00991504e97a8..607eb714f85dc 100644 --- a/clang/lib/Driver/ToolChains/OHOS.cpp +++ b/clang/lib/Driver/ToolChains/OHOS.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/FileSystem.h" @@ -174,7 +174,7 @@ OHOS::OHOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) ToolChain::RuntimeLibType OHOS::GetRuntimeLibType( const ArgList &Args) const { - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) { + if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) { StringRef Value = A->getValue(); if (Value != "compiler-rt") getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name) diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index 8f589186af343..5e7b4f1a664e6 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -13,8 +13,8 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" #include "llvm/Support/VirtualFileSystem.h" @@ -315,7 +315,7 @@ void OpenBSD::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp index 8d381c4f14371..76180431ee682 100644 --- a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp @@ -8,7 +8,7 @@ #include "PPCFreeBSD.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Support/Path.h" using namespace clang::driver::toolchains; @@ -16,7 +16,7 @@ using namespace llvm::opt; void PPCFreeBSDToolChain::AddClangSystemIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) && + if (!DriverArgs.hasArg(options::OPT_nostdinc) && !DriverArgs.hasArg(options::OPT_nobuiltininc)) { const Driver &D = getDriver(); SmallString<128> P(D.ResourceDir); diff --git a/clang/lib/Driver/ToolChains/PPCLinux.cpp b/clang/lib/Driver/ToolChains/PPCLinux.cpp index 768214e416bd7..672ebd5b7b98d 100644 --- a/clang/lib/Driver/ToolChains/PPCLinux.cpp +++ b/clang/lib/Driver/ToolChains/PPCLinux.cpp @@ -8,7 +8,7 @@ #include "PPCLinux.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -58,7 +58,7 @@ PPCLinuxToolChain::PPCLinuxToolChain(const Driver &D, void PPCLinuxToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) && + if (!DriverArgs.hasArg(options::OPT_nostdinc) && !DriverArgs.hasArg(options::OPT_nobuiltininc)) { const Driver &D = getDriver(); SmallString<128> P(D.ResourceDir); diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 34ec65ae59602..6fe18aa4cceba 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -11,8 +11,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/SPIRV.cpp b/clang/lib/Driver/ToolChains/SPIRV.cpp index afb9e63b4b348..22aad7e3f49d7 100644 --- a/clang/lib/Driver/ToolChains/SPIRV.cpp +++ b/clang/lib/Driver/ToolChains/SPIRV.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" using namespace clang::driver; using namespace clang::driver::toolchains; diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 0232b047a6c4b..85859f344b491 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -20,7 +20,7 @@ SYCLInstallationDetector::SYCLInstallationDetector( void SYCLInstallationDetector::addSYCLIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nobuiltininc)) + if (DriverArgs.hasArg(options::OPT_nobuiltininc)) return; // Add the SYCL header search locations in the specified order. diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index 64c7d1ceb3a36..ad0f41144f393 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -13,9 +13,9 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/ToolChain.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" @@ -360,7 +360,7 @@ void Solaris::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/UEFI.cpp b/clang/lib/Driver/ToolChains/UEFI.cpp index d2be147c7b9f6..7732e37f8061d 100644 --- a/clang/lib/Driver/ToolChains/UEFI.cpp +++ b/clang/lib/Driver/ToolChains/UEFI.cpp @@ -11,8 +11,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" diff --git a/clang/lib/Driver/ToolChains/VEToolchain.cpp b/clang/lib/Driver/ToolChains/VEToolchain.cpp index ad9129046c3e1..78509bcdae0fe 100644 --- a/clang/lib/Driver/ToolChains/VEToolchain.cpp +++ b/clang/lib/Driver/ToolChains/VEToolchain.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" #include // ::getenv @@ -78,7 +78,7 @@ bool VEToolChain::hasBlocksRuntime() const { return false; } void VEToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (DriverArgs.hasArg(options::OPT_nobuiltininc) && @@ -117,7 +117,7 @@ void VEToolChain::addClangTargetOptions(const ArgList &DriverArgs, void VEToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) || + if (DriverArgs.hasArg(options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp index 5054868b5ff4d..15c6f19e87fee 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -12,7 +12,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Config/llvm-config.h" // for LLVM_VERSION_STRING #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" @@ -297,7 +297,7 @@ bool WebAssembly::HasNativeLLVMSupport() const { return true; } void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, ArgStringList &CC1Args, Action::OffloadKind) const { - if (!DriverArgs.hasFlag(clang::driver::options::OPT_fuse_init_array, + if (!DriverArgs.hasFlag(options::OPT_fuse_init_array, options::OPT_fno_use_init_array, true)) CC1Args.push_back("-fno-use-init-array"); @@ -472,7 +472,7 @@ WebAssembly::GetCXXStdlibType(const ArgList &Args) const { void WebAssembly::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; const Driver &D = getDriver(); diff --git a/clang/lib/Driver/ToolChains/XCore.cpp b/clang/lib/Driver/ToolChains/XCore.cpp index 6a2a75cb99739..dd26c11affffb 100644 --- a/clang/lib/Driver/ToolChains/XCore.cpp +++ b/clang/lib/Driver/ToolChains/XCore.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include // ::getenv @@ -113,7 +113,7 @@ bool XCoreToolChain::hasBlocksRuntime() const { return false; } void XCoreToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) || + if (DriverArgs.hasArg(options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc)) return; if (const char *cl_include_dir = getenv("XCC_C_INCLUDE_PATH")) { @@ -137,7 +137,7 @@ void XCoreToolChain::addClangTargetOptions(const ArgList &DriverArgs, void XCoreToolChain::AddClangCXXStdlibIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) || + if (DriverArgs.hasArg(options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp index 9a3c45323a3cf..eac8f623f9a50 100644 --- a/clang/lib/Driver/ToolChains/ZOS.cpp +++ b/clang/lib/Driver/ToolChains/ZOS.cpp @@ -9,7 +9,7 @@ #include "ZOS.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/WithColor.h" diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index 0325296f84b19..4c2d11751a363 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -8,8 +8,8 @@ #include "clang/Driver/XRayArgs.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/SpecialCaseList.h" diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt index a916667208845..dac9e0d26f393 100644 --- a/clang/lib/Frontend/CMakeLists.txt +++ b/clang/lib/Frontend/CMakeLists.txt @@ -52,6 +52,7 @@ add_clang_library(clangFrontend clangAST clangBasic clangDriver + clangOptions clangEdit clangLex clangParse diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 66759d363d531..0782dc1b585c3 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -27,7 +27,6 @@ #include "clang/Basic/XRayInstr.h" #include "clang/Config/config.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CommandLineSourceLoc.h" #include "clang/Frontend/DependencyOutputOptions.h" #include "clang/Frontend/FrontendDiagnostic.h" @@ -38,6 +37,7 @@ #include "clang/Frontend/Utils.h" #include "clang/Lex/HeaderSearchOptions.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Options/Options.h" #include "clang/Serialization/ASTBitCodes.h" #include "clang/Serialization/ModuleFileExtension.h" #include "clang/StaticAnalyzer/Core/AnalyzerOptions.h" @@ -255,7 +255,7 @@ CowCompilerInvocation::getMutPreprocessorOutputOpts() { using ArgumentConsumer = CompilerInvocation::ArgumentConsumer; #define OPTTABLE_STR_TABLE_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_STR_TABLE_CODE static llvm::StringRef lookupStrInTable(unsigned Offset) { @@ -263,7 +263,7 @@ static llvm::StringRef lookupStrInTable(unsigned Offset) { } #define SIMPLE_ENUM_VALUE_TABLE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef SIMPLE_ENUM_VALUE_TABLE static std::optional normalizeSimpleFlag(OptSpecifier Opt, @@ -989,7 +989,7 @@ static void GenerateAnalyzerArgs(const AnalyzerOptions &Opts, #define ANALYZER_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef ANALYZER_OPTION_WITH_MARSHALLING if (Opts.AnalysisConstraintsOpt != RangeConstraintsModel) { @@ -1076,7 +1076,7 @@ static bool ParseAnalyzerArgs(AnalyzerOptions &Opts, ArgList &Args, #define ANALYZER_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef ANALYZER_OPTION_WITH_MARSHALLING if (Arg *A = Args.getLastArg(OPT_analyzer_constraints)) { @@ -1583,7 +1583,7 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, #define CODEGEN_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef CODEGEN_OPTION_WITH_MARSHALLING if (Opts.OptimizationLevel > 0) { @@ -1888,7 +1888,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, #define CODEGEN_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef CODEGEN_OPTION_WITH_MARSHALLING // At O0 we want to fully disable inlining outside of cases marked with @@ -2379,7 +2379,7 @@ static void GenerateDependencyOutputArgs(const DependencyOutputOptions &Opts, const DependencyOutputOptions &DependencyOutputOpts = Opts; #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING if (Opts.ShowIncludesDest != ShowIncludesDestination::None) @@ -2414,7 +2414,7 @@ static bool ParseDependencyOutputArgs(DependencyOutputOptions &Opts, DependencyOutputOptions &DependencyOutputOpts = Opts; #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING if (Args.hasArg(OPT_show_includes)) { @@ -2542,7 +2542,7 @@ static void GenerateFileSystemArgs(const FileSystemOptions &Opts, #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING } @@ -2554,7 +2554,7 @@ static bool ParseFileSystemArgs(FileSystemOptions &Opts, const ArgList &Args, #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING return Diags.getNumErrors() == NumErrorsBefore; @@ -2565,7 +2565,7 @@ static void GenerateMigratorArgs(const MigratorOptions &Opts, const MigratorOptions &MigratorOpts = Opts; #define MIGRATOR_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef MIGRATOR_OPTION_WITH_MARSHALLING } @@ -2577,7 +2577,7 @@ static bool ParseMigratorArgs(MigratorOptions &Opts, const ArgList &Args, #define MIGRATOR_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef MIGRATOR_OPTION_WITH_MARSHALLING return Diags.getNumErrors() == NumErrorsBefore; @@ -2589,7 +2589,7 @@ void CompilerInvocationBase::GenerateDiagnosticArgs( const DiagnosticOptions *DiagnosticOpts = &Opts; #define DIAG_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef DIAG_OPTION_WITH_MARSHALLING if (!Opts.DiagnosticSerializationFile.empty()) @@ -2694,7 +2694,7 @@ bool clang::ParseDiagnosticArgs(DiagnosticOptions &Opts, ArgList &Args, #define DIAG_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, *Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef DIAG_OPTION_WITH_MARSHALLING llvm::sys::Process::UseANSIEscapeCodes(Opts.UseANSIEscapeCodes); @@ -2844,7 +2844,7 @@ static void GenerateFrontendArgs(const FrontendOptions &Opts, const FrontendOptions &FrontendOpts = Opts; #define FRONTEND_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef FRONTEND_OPTION_WITH_MARSHALLING std::optional ProgramActionOpt = @@ -3014,7 +3014,7 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args, #define FRONTEND_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef FRONTEND_OPTION_WITH_MARSHALLING Opts.ProgramAction = frontend::ParseSyntaxOnly; @@ -3296,7 +3296,7 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts, const HeaderSearchOptions *HeaderSearchOpts = &Opts; #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING if (Opts.UseLibcxx) @@ -3411,7 +3411,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args, #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING if (const Arg *A = Args.getLastArg(OPT_stdlib_EQ)) @@ -3744,7 +3744,7 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, #define LANG_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // The '-fcf-protection=' option is generated by CodeGenOpts generator. @@ -4153,7 +4153,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, #define LANG_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) { @@ -4874,7 +4874,7 @@ static void GeneratePreprocessorArgs(const PreprocessorOptions &Opts, #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef PREPROCESSOR_OPTION_WITH_MARSHALLING if (Opts.PCHWithHdrStop && !Opts.PCHWithHdrStopCreate) @@ -4948,7 +4948,7 @@ static bool ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args, #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef PREPROCESSOR_OPTION_WITH_MARSHALLING Opts.PCHWithHdrStop = Args.hasArg(OPT_pch_through_hdrstop_create) || @@ -5041,7 +5041,7 @@ GeneratePreprocessorOutputArgs(const PreprocessorOutputOptions &Opts, #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING bool Generate_dM = isStrictlyPreprocessorAction(Action) && !Opts.ShowCPP; @@ -5062,7 +5062,7 @@ static bool ParsePreprocessorOutputArgs(PreprocessorOutputOptions &Opts, #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING Opts.ShowCPP = isStrictlyPreprocessorAction(Action) && !Args.hasArg(OPT_dM); @@ -5077,7 +5077,7 @@ static void GenerateTargetArgs(const TargetOptions &Opts, const TargetOptions *TargetOpts = &Opts; #define TARGET_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef TARGET_OPTION_WITH_MARSHALLING if (!Opts.SDKVersion.empty()) @@ -5096,7 +5096,7 @@ static bool ParseTargetArgs(TargetOptions &Opts, ArgList &Args, #define TARGET_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef TARGET_OPTION_WITH_MARSHALLING if (Arg *A = Args.getLastArg(options::OPT_target_sdk_version_EQ)) { diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp index 99212b81fe064..73b81ed906808 100644 --- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp +++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -14,11 +14,11 @@ #include "clang/Driver/Action.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/Utils.h" +#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Option/ArgList.h" @@ -61,11 +61,11 @@ clang::createInvocation(ArrayRef ArgList, if (!C) return nullptr; - if (C->getArgs().hasArg(driver::options::OPT_fdriver_only)) + if (C->getArgs().hasArg(options::OPT_fdriver_only)) return nullptr; // Just print the cc1 options if -### was present. - if (C->getArgs().hasArg(driver::options::OPT__HASH_HASH_HASH)) { + if (C->getArgs().hasArg(options::OPT__HASH_HASH_HASH)) { C->getJobs().Print(llvm::errs(), "\n", true); return nullptr; } diff --git a/clang/lib/FrontendTool/CMakeLists.txt b/clang/lib/FrontendTool/CMakeLists.txt index 061e54c3e62d0..66213f76eb968 100644 --- a/clang/lib/FrontendTool/CMakeLists.txt +++ b/clang/lib/FrontendTool/CMakeLists.txt @@ -7,6 +7,7 @@ set(link_libs clangBasic clangCodeGen clangDriver + clangOptions clangExtractAPI clangFrontend clangRewriteFrontend diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index c8aad4daa1c10..e571193c6a9c8 100644 --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -13,7 +13,6 @@ #include "clang/CodeGen/CodeGenAction.h" #include "clang/Config/config.h" -#include "clang/Driver/Options.h" #include "clang/ExtractAPI/FrontendActions.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/CompilerInvocation.h" @@ -22,6 +21,7 @@ #include "clang/Frontend/FrontendPluginRegistry.h" #include "clang/Frontend/Utils.h" #include "clang/FrontendTool/Utils.h" +#include "clang/Options/Options.h" #include "clang/Rewrite/Frontend/FrontendActions.h" #include "clang/StaticAnalyzer/Frontend/AnalyzerHelpFlags.h" #include "clang/StaticAnalyzer/Frontend/FrontendActions.h" @@ -215,11 +215,11 @@ bool ExecuteCompilerInvocation(CompilerInstance *Clang) { // Honor -help. if (Clang->getFrontendOpts().ShowHelp) { - driver::getDriverOptTable().printHelp( + getDriverOptTable().printHelp( llvm::outs(), "clang -cc1 [options] file...", "LLVM 'Clang' Compiler: http://clang.llvm.org", /*ShowHidden=*/false, /*ShowAllAliases=*/false, - llvm::opt::Visibility(driver::options::CC1Option)); + llvm::opt::Visibility(options::CC1Option)); return true; } diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index 76338065d2231..7764fa7dc92b9 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -33,7 +33,6 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" @@ -43,6 +42,7 @@ #include "clang/Interpreter/Interpreter.h" #include "clang/Interpreter/Value.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Options/Options.h" #include "clang/Sema/Lookup.h" #include "clang/Serialization/ObjectFilePCHContainerReader.h" #include "llvm/ExecutionEngine/JITSymbol.h" @@ -185,7 +185,7 @@ IncrementalCompilerBuilder::create(std::string TT, llvm::ArrayRef RF = llvm::ArrayRef(ClangArgv); std::unique_ptr Compilation(Driver.BuildCompilation(RF)); - if (Compilation->getArgs().hasArg(driver::options::OPT_v)) + if (Compilation->getArgs().hasArg(options::OPT_v)) Compilation->getJobs().Print(llvm::errs(), "\n", /*Quote=*/false); auto ErrOrCC1Args = GetCC1Arguments(&Diags, Compilation.get()); diff --git a/clang/lib/Interpreter/InterpreterUtils.h b/clang/lib/Interpreter/InterpreterUtils.h index fbf9814b0d4a7..4efe8b9fbc6cc 100644 --- a/clang/lib/Interpreter/InterpreterUtils.h +++ b/clang/lib/Interpreter/InterpreterUtils.h @@ -21,11 +21,11 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/TextDiagnosticBuffer.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Options/Options.h" #include "clang/Sema/Lookup.h" #include "llvm/IR/Module.h" diff --git a/clang/lib/Options/CMakeLists.txt b/clang/lib/Options/CMakeLists.txt new file mode 100644 index 0000000000000..a762e9918b41c --- /dev/null +++ b/clang/lib/Options/CMakeLists.txt @@ -0,0 +1,18 @@ +set(LLVM_LINK_COMPONENTS + Option + Support +) + +add_clang_library(clangOptions + DriverOptions.cpp + OptionUtils.cpp + + DEPENDS + ClangDriverOptions + # These generated headers are included transitively. + target_parser_gen + + LINK_LIBS + clangBasic + ${system_libs} +) diff --git a/clang/lib/Driver/DriverOptions.cpp b/clang/lib/Options/DriverOptions.cpp similarity index 76% rename from clang/lib/Driver/DriverOptions.cpp rename to clang/lib/Options/DriverOptions.cpp index cde1f8989935b..d91e9291fb2f6 100644 --- a/clang/lib/Driver/DriverOptions.cpp +++ b/clang/lib/Options/DriverOptions.cpp @@ -6,33 +6,32 @@ // //===----------------------------------------------------------------------===// -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/OptTable.h" #include -using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace llvm::opt; #define OPTTABLE_STR_TABLE_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_STR_TABLE_CODE #define OPTTABLE_VALUES_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_VALUES_CODE #define OPTTABLE_PREFIXES_TABLE_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_PREFIXES_TABLE_CODE #define OPTTABLE_PREFIXES_UNION_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_PREFIXES_UNION_CODE static constexpr OptTable::Info InfoTable[] = { #define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__), -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTION }; @@ -44,9 +43,9 @@ class DriverOptTable : public PrecomputedOptTable { : PrecomputedOptTable(OptionStrTable, OptionPrefixesTable, InfoTable, OptionPrefixesUnion) {} }; -} +} // anonymous namespace -const llvm::opt::OptTable &clang::driver::getDriverOptTable() { +const llvm::opt::OptTable &clang::getDriverOptTable() { static DriverOptTable Table; return Table; } diff --git a/clang/lib/Driver/OptionUtils.cpp b/clang/lib/Options/OptionUtils.cpp similarity index 97% rename from clang/lib/Driver/OptionUtils.cpp rename to clang/lib/Options/OptionUtils.cpp index 1f36ffc03cab3..fcafd3c83c6b3 100644 --- a/clang/lib/Driver/OptionUtils.cpp +++ b/clang/lib/Options/OptionUtils.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// +#include "clang/Options/OptionUtils.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticDriver.h" -#include "clang/Driver/OptionUtils.h" #include "llvm/Option/ArgList.h" using namespace clang; diff --git a/clang/lib/Tooling/CMakeLists.txt b/clang/lib/Tooling/CMakeLists.txt index fc1f1f9f9d367..faaa53276d0e6 100644 --- a/clang/lib/Tooling/CMakeLists.txt +++ b/clang/lib/Tooling/CMakeLists.txt @@ -40,6 +40,7 @@ add_clang_library(clangTooling clangASTMatchers clangBasic clangDriver + clangOptions clangFormat clangFrontend clangLex diff --git a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp index 28568426a6c48..e9b72388ae4df 100644 --- a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp +++ b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp @@ -44,8 +44,8 @@ #include "clang/Basic/LangStandard.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Types.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -164,11 +164,11 @@ struct TransferableCommand { // We parse each argument individually so that we can retain the exact // spelling of each argument; re-rendering is lossy for aliased flags. // E.g. in CL mode, /W4 maps to -Wall. - auto &OptTable = clang::driver::getDriverOptTable(); + auto &OptTable = getDriverOptTable(); if (!OldArgs.empty()) Cmd.CommandLine.emplace_back(OldArgs.front()); for (unsigned Pos = 1; Pos < OldArgs.size();) { - using namespace driver::options; + using namespace options; const unsigned OldPos = Pos; std::unique_ptr Arg(OptTable.ParseOneArg( @@ -296,14 +296,14 @@ struct TransferableCommand { // Try to interpret the argument as a type specifier, e.g. '-x'. std::optional tryParseTypeArg(const llvm::opt::Arg &Arg) { const llvm::opt::Option &Opt = Arg.getOption(); - using namespace driver::options; + using namespace options; if (ClangCLMode) { if (Opt.matches(OPT__SLASH_TC) || Opt.matches(OPT__SLASH_Tc)) return types::TY_C; if (Opt.matches(OPT__SLASH_TP) || Opt.matches(OPT__SLASH_Tp)) return types::TY_CXX; } else { - if (Opt.matches(driver::options::OPT_x)) + if (Opt.matches(options::OPT_x)) return types::lookupTypeForTypeSpecifier(Arg.getValue()); } return std::nullopt; @@ -311,7 +311,7 @@ struct TransferableCommand { // Try to interpret the argument as '-std='. std::optional tryParseStdArg(const llvm::opt::Arg &Arg) { - using namespace driver::options; + using namespace options; if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ)) { // "c++latest" is not a recognized LangStandard, but it's accepted by // the clang driver in CL mode. diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index e8eef5ed9c9fa..1f6a5c94601fc 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -21,7 +21,6 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" #include "clang/Frontend/ASTUnit.h" @@ -32,6 +31,7 @@ #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Lex/HeaderSearchOptions.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Options/Options.h" #include "clang/Tooling/ArgumentsAdjusters.h" #include "clang/Tooling/CompilationDatabase.h" #include "llvm/ADT/ArrayRef.h" @@ -270,17 +270,15 @@ void addTargetAndModeForProgramName(std::vector &CommandLine, StringRef InvokedAs) { if (CommandLine.empty() || InvokedAs.empty()) return; - const auto &Table = driver::getDriverOptTable(); + const auto &Table = getDriverOptTable(); // --target=X - StringRef TargetOPT = - Table.getOption(driver::options::OPT_target).getPrefixedName(); + StringRef TargetOPT = Table.getOption(options::OPT_target).getPrefixedName(); // -target X StringRef TargetOPTLegacy = - Table.getOption(driver::options::OPT_target_legacy_spelling) - .getPrefixedName(); + Table.getOption(options::OPT_target_legacy_spelling).getPrefixedName(); // --driver-mode=X StringRef DriverModeOPT = - Table.getOption(driver::options::OPT_driver_mode).getPrefixedName(); + Table.getOption(options::OPT_driver_mode).getPrefixedName(); auto TargetMode = driver::ToolChain::getTargetAndModeFromProgramName(InvokedAs); // No need to search for target args if we don't have a target/mode to insert. diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp index 3fb78dc871904..4eab3999dfc42 100644 --- a/clang/test/CIR/CodeGen/complex.cpp +++ b/clang/test/CIR/CodeGen/complex.cpp @@ -1495,3 +1495,42 @@ void calling_function_that_return_complex() { // OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 // OGCG: store float %[[RESULT_REAL]], ptr %[[A_REAL_PTR]], align 4 // OGCG: store float %[[RESULT_IMAG]], ptr %[[A_IMAG_PTR]], align 4 + +void imag_literal_gnu_extension() { + float _Complex a = 3.0fi; + double _Complex b = 3.0i; + int _Complex c = 3i; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a", init] +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c", init] +// CIR: %[[COMPLEX_A:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.float, #cir.fp<3.000000e+00> : !cir.float> : !cir.complex +// CIR: cir.store{{.*}} %[[COMPLEX_A]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR: %[[COMPLEX_B:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.double, #cir.fp<3.000000e+00> : !cir.double> : !cir.complex +// CIR: cir.store{{.*}} %[[COMPLEX_B]], %[[B_ADDR]] : !cir.complex, !cir.ptr> +// CIR: %[[COMPLEX_C:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<3> : !s32i> : !cir.complex +// CIR: cir.store{{.*}} %[[COMPLEX_C]], %[[C_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { double, double }, i64 1, align 8 +// LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: store { float, float } { float 0.000000e+00, float 3.000000e+00 }, ptr %[[A_ADDR]], align 4 +// LLVM: store { double, double } { double 0.000000e+00, double 3.000000e+00 }, ptr %[[B_ADDR]], align 8 +// LLVM: store { i32, i32 } { i32 0, i32 3 }, ptr %[[C_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { double, double }, align 8 +// OGCG: %[[C_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float 0.000000e+00, ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float 3.000000e+00, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store double 0.000000e+00, ptr %[[B_REAL_PTR]], align 8 +// OGCG: store double 3.000000e+00, ptr %[[B_IMAG_PTR]], align 8 +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 0 +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 1 +// OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4 +// OGCG: store i32 3, ptr %[[C_IMAG_PTR]], align 4 diff --git a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames index 329cea9a0adfb..2452ee345fe2f 100644 --- a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames +++ b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames @@ -1 +1,3 @@ -!world +v1 +f world +c 0 diff --git a/clang/test/CodeGen/basic-block-sections.c b/clang/test/CodeGen/basic-block-sections.c index a61b8dd4ac376..0c21a4cb1442c 100644 --- a/clang/test/CodeGen/basic-block-sections.c +++ b/clang/test/CodeGen/basic-block-sections.c @@ -30,8 +30,10 @@ int another(int a) { // // BB_WORLD: .section .text.world,"ax",@progbits{{$}} // BB_WORLD: world: -// BB_WORLD: .section .text.world,"ax",@progbits,unique -// BB_WORLD: world.__part.1: +// BB_ALL: .section .text.world,"ax",@progbits,unique +// BB_ALL: world.__part.1: +// BB_LIST: .section .text.split.world,"ax",@progbits +// BB_LIST: world.cold: // BB_ALL: .section .text.another,"ax",@progbits // BB_ALL: another.__part.1: // BB_LIST-NOT: .section .text.another,"ax",@progbits diff --git a/clang/test/OpenMP/force-usm.c b/clang/test/OpenMP/force-usm.c index 5c63a9a5e7004..45c0e28b525da 100644 --- a/clang/test/OpenMP/force-usm.c +++ b/clang/test/OpenMP/force-usm.c @@ -46,7 +46,7 @@ int main(void) { // CHECK-USM-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK-USM: user_code.entry: // CHECK-USM-NEXT: store i32 1, ptr [[TMP0]], align 4 -// CHECK-USM-NEXT: [[TMP2:%.*]] = load ptr, ptr @pGI_decl_tgt_ref_ptr, align 8 +// CHECK-USM-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(1) @pGI_decl_tgt_ref_ptr, align 8 // CHECK-USM-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 // CHECK-USM-NEXT: store i32 2, ptr [[TMP3]], align 4 // CHECK-USM-NEXT: call void @__kmpc_target_deinit() diff --git a/clang/test/OpenMP/spirv_target_codegen_basic.cpp b/clang/test/OpenMP/spirv_target_codegen_basic.cpp index fb2810e88c063..6e029fb93644d 100644 --- a/clang/test/OpenMP/spirv_target_codegen_basic.cpp +++ b/clang/test/OpenMP/spirv_target_codegen_basic.cpp @@ -6,12 +6,18 @@ // CHECK: @__omp_offloading_{{.*}}_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer // CHECK: @__omp_offloading_{{.*}}_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy +// CHECK: @"_gomp_critical_user_$var" = common addrspace(1) global [8 x i32] zeroinitializer, align 8 + // CHECK: define weak_odr protected spir_kernel void @__omp_offloading_{{.*}} +// CHECK: call spir_func addrspace(9) void @__kmpc_critical(ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}} to ptr addrspace(4)), i32 %{{.*}}, ptr addrspace(4) addrspacecast (ptr addrspace(1) @"_gomp_critical_user_$var" to ptr addrspace(4))) +// CHECK: call spir_func addrspace(9) void @__kmpc_end_critical(ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}} to ptr addrspace(4)), i32 %{{.*}}, ptr addrspace(4) addrspacecast (ptr addrspace(1) @"_gomp_critical_user_$var" to ptr addrspace(4))) + int main() { int ret = 0; #pragma omp target for(int i = 0; i < 5; i++) + #pragma omp critical ret++; return ret; } diff --git a/clang/tools/clang-check/CMakeLists.txt b/clang/tools/clang-check/CMakeLists.txt index 5493aa4237aee..1efcc91fcaec9 100644 --- a/clang/tools/clang-check/CMakeLists.txt +++ b/clang/tools/clang-check/CMakeLists.txt @@ -14,6 +14,7 @@ clang_target_link_libraries(clang-check clangBasic clangDriver clangFrontend + clangOptions clangRewriteFrontend clangSerialization clangStaticAnalyzerFrontend diff --git a/clang/tools/clang-check/ClangCheck.cpp b/clang/tools/clang-check/ClangCheck.cpp index fa6dd06a1ee58..80255c647b98f 100644 --- a/clang/tools/clang-check/ClangCheck.cpp +++ b/clang/tools/clang-check/ClangCheck.cpp @@ -16,9 +16,9 @@ //===----------------------------------------------------------------------===// #include "clang/AST/ASTConsumer.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/ASTConsumers.h" #include "clang/Frontend/CompilerInstance.h" +#include "clang/Options/Options.h" #include "clang/Rewrite/Frontend/FixItRewriter.h" #include "clang/Rewrite/Frontend/FrontendActions.h" #include "clang/StaticAnalyzer/Frontend/FrontendActions.h" @@ -34,8 +34,8 @@ #include "llvm/Support/Signals.h" #include "llvm/Support/TargetSelect.h" -using namespace clang::driver; using namespace clang::tooling; +using namespace clang; using namespace llvm; static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage); diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt index 9c0d9dff7dc7f..54bc80486472f 100644 --- a/clang/tools/clang-installapi/CMakeLists.txt +++ b/clang/tools/clang-installapi/CMakeLists.txt @@ -25,6 +25,7 @@ clang_target_link_libraries(clang-installapi clangAST clangInstallAPI clangBasic + clangOptions clangDriver clangFrontend clangTooling diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp index 4e66485343b89..8bef9690ad855 100644 --- a/clang/tools/clang-installapi/ClangInstallAPI.cpp +++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp @@ -35,7 +35,7 @@ using namespace clang; using namespace clang::installapi; -using namespace clang::driver::options; +using namespace clang::options; using namespace llvm::opt; using namespace llvm::MachO; @@ -71,7 +71,7 @@ static bool runFrontend(StringRef ProgName, Twine Label, bool Verbose, static bool run(ArrayRef Args, const char *ProgName) { // Setup Diagnostics engine. DiagnosticOptions DiagOpts; - const llvm::opt::OptTable &ClangOpts = clang::driver::getDriverOptTable(); + const llvm::opt::OptTable &ClangOpts = getDriverOptTable(); unsigned MissingArgIndex, MissingArgCount; llvm::opt::InputArgList ParsedArgs = ClangOpts.ParseArgs( ArrayRef(Args).slice(1), MissingArgIndex, MissingArgCount); diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp index 64324a3f8b010..f484d6f33ad8f 100644 --- a/clang/tools/clang-installapi/Options.cpp +++ b/clang/tools/clang-installapi/Options.cpp @@ -26,8 +26,6 @@ using namespace llvm; using namespace llvm::opt; using namespace llvm::MachO; -namespace drv = clang::driver::options; - namespace clang { namespace installapi { @@ -109,7 +107,7 @@ getArgListFromJSON(const StringRef Input, llvm::opt::OptTable *Table, bool Options::processDriverOptions(InputArgList &Args) { // Handle inputs. - for (const StringRef Path : Args.getAllArgValues(drv::OPT_INPUT)) { + for (const StringRef Path : Args.getAllArgValues(options::OPT_INPUT)) { // Assume any input that is not a directory is a filelist. // InstallAPI does not accept multiple directories, so retain the last one. if (FM->getOptionalDirectoryRef(Path)) @@ -120,7 +118,7 @@ bool Options::processDriverOptions(InputArgList &Args) { // Handle output. SmallString OutputPath; - if (auto *Arg = Args.getLastArg(drv::OPT_o)) { + if (auto *Arg = Args.getLastArg(options::OPT_o)) { OutputPath = Arg->getValue(); if (OutputPath != "-") FM->makeAbsolutePath(OutputPath); @@ -132,10 +130,10 @@ bool Options::processDriverOptions(InputArgList &Args) { } // Do basic error checking first for mixing -target and -arch options. - auto *ArgArch = Args.getLastArgNoClaim(drv::OPT_arch); - auto *ArgTarget = Args.getLastArgNoClaim(drv::OPT_target); + auto *ArgArch = Args.getLastArgNoClaim(options::OPT_arch); + auto *ArgTarget = Args.getLastArgNoClaim(options::OPT_target); auto *ArgTargetVariant = - Args.getLastArgNoClaim(drv::OPT_darwin_target_variant); + Args.getLastArgNoClaim(options::OPT_darwin_target_variant); if (ArgArch && (ArgTarget || ArgTargetVariant)) { Diags->Report(clang::diag::err_drv_argument_not_allowed_with) << ArgArch->getAsString(Args) @@ -143,7 +141,7 @@ bool Options::processDriverOptions(InputArgList &Args) { return false; } - auto *ArgMinTargetOS = Args.getLastArgNoClaim(drv::OPT_mtargetos_EQ); + auto *ArgMinTargetOS = Args.getLastArgNoClaim(options::OPT_mtargetos_EQ); if ((ArgTarget || ArgTargetVariant) && ArgMinTargetOS) { Diags->Report(clang::diag::err_drv_cannot_mix_options) << ArgTarget->getAsString(Args) << ArgMinTargetOS->getAsString(Args); @@ -152,7 +150,7 @@ bool Options::processDriverOptions(InputArgList &Args) { // Capture target triples first. if (ArgTarget) { - for (const Arg *A : Args.filtered(drv::OPT_target)) { + for (const Arg *A : Args.filtered(options::OPT_target)) { A->claim(); llvm::Triple TargetTriple(A->getValue()); Target TAPITarget = Target(TargetTriple); @@ -168,7 +166,7 @@ bool Options::processDriverOptions(InputArgList &Args) { // Capture target variants. DriverOpts.Zippered = ArgTargetVariant != nullptr; - for (Arg *A : Args.filtered(drv::OPT_darwin_target_variant)) { + for (Arg *A : Args.filtered(options::OPT_darwin_target_variant)) { A->claim(); Triple Variant(A->getValue()); if (Variant.getVendor() != Triple::Apple) { @@ -213,7 +211,7 @@ bool Options::processDriverOptions(InputArgList &Args) { DriverOpts.Targets[TAPIVariant] = Variant; } - DriverOpts.Verbose = Args.hasArgNoClaim(drv::OPT_v); + DriverOpts.Verbose = Args.hasArgNoClaim(options::OPT_v); return true; } @@ -407,7 +405,7 @@ bool Options::processOptionList(InputArgList &Args, bool Options::processLinkerOptions(InputArgList &Args) { // Handle required arguments. - if (const Arg *A = Args.getLastArg(drv::OPT_install__name)) + if (const Arg *A = Args.getLastArg(options::OPT_install__name)) LinkerOpts.InstallName = A->getValue(); if (LinkerOpts.InstallName.empty()) { Diags->Report(diag::err_no_install_name); @@ -415,28 +413,29 @@ bool Options::processLinkerOptions(InputArgList &Args) { } // Defaulted or optional arguments. - if (auto *Arg = Args.getLastArg(drv::OPT_current__version)) + if (auto *Arg = Args.getLastArg(options::OPT_current__version)) LinkerOpts.CurrentVersion.parse64(Arg->getValue()); - if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version)) + if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version)) LinkerOpts.CompatVersion.parse64(Arg->getValue()); - if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version)) + if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version)) LinkerOpts.CompatVersion.parse64(Arg->getValue()); - if (auto *Arg = Args.getLastArg(drv::OPT_umbrella)) + if (auto *Arg = Args.getLastArg(options::OPT_umbrella)) LinkerOpts.ParentUmbrella = Arg->getValue(); - LinkerOpts.IsDylib = Args.hasArg(drv::OPT_dynamiclib); + LinkerOpts.IsDylib = Args.hasArg(options::OPT_dynamiclib); - for (auto *Arg : Args.filtered(drv::OPT_alias_list)) { + for (auto *Arg : Args.filtered(options::OPT_alias_list)) { LinkerOpts.AliasLists.emplace_back(Arg->getValue()); Arg->claim(); } - LinkerOpts.AppExtensionSafe = Args.hasFlag( - drv::OPT_fapplication_extension, drv::OPT_fno_application_extension, - /*Default=*/LinkerOpts.AppExtensionSafe); + LinkerOpts.AppExtensionSafe = + Args.hasFlag(options::OPT_fapplication_extension, + options::OPT_fno_application_extension, + /*Default=*/LinkerOpts.AppExtensionSafe); if (::getenv("LD_NO_ENCRYPT") != nullptr) LinkerOpts.AppExtensionSafe = true; @@ -446,7 +445,7 @@ bool Options::processLinkerOptions(InputArgList &Args) { // Capture library paths. PathSeq LibraryPaths; - for (const Arg *A : Args.filtered(drv::OPT_L)) { + for (const Arg *A : Args.filtered(options::OPT_L)) { LibraryPaths.emplace_back(A->getValue()); A->claim(); } @@ -461,7 +460,7 @@ bool Options::processLinkerOptions(InputArgList &Args) { // invocations. bool Options::processFrontendOptions(InputArgList &Args) { // Capture language mode. - if (auto *A = Args.getLastArgNoClaim(drv::OPT_x)) { + if (auto *A = Args.getLastArgNoClaim(options::OPT_x)) { FEOpts.LangMode = llvm::StringSwitch(A->getValue()) .Case("c", clang::Language::C) .Case("c++", clang::Language::CXX) @@ -475,15 +474,15 @@ bool Options::processFrontendOptions(InputArgList &Args) { return false; } } - for (auto *A : Args.filtered(drv::OPT_ObjC, drv::OPT_ObjCXX)) { - if (A->getOption().matches(drv::OPT_ObjC)) + for (auto *A : Args.filtered(options::OPT_ObjC, options::OPT_ObjCXX)) { + if (A->getOption().matches(options::OPT_ObjC)) FEOpts.LangMode = clang::Language::ObjC; else FEOpts.LangMode = clang::Language::ObjCXX; } // Capture Sysroot. - if (const Arg *A = Args.getLastArgNoClaim(drv::OPT_isysroot)) { + if (const Arg *A = Args.getLastArgNoClaim(options::OPT_isysroot)) { SmallString Path(A->getValue()); FM->makeAbsolutePath(Path); if (!FM->getOptionalDirectoryRef(Path)) { @@ -502,13 +501,13 @@ bool Options::processFrontendOptions(InputArgList &Args) { } // Capture system frameworks for all platforms. - for (const Arg *A : Args.filtered(drv::OPT_iframework)) + for (const Arg *A : Args.filtered(options::OPT_iframework)) FEOpts.SystemFwkPaths.emplace_back(A->getValue(), std::optional{}); // Capture framework paths. PathSeq FrameworkPaths; - for (const Arg *A : Args.filtered(drv::OPT_F)) + for (const Arg *A : Args.filtered(options::OPT_F)) FrameworkPaths.emplace_back(A->getValue()); if (!FrameworkPaths.empty()) diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt index a9fa61f9f75ee..db9e0de9b59e0 100644 --- a/clang/tools/driver/CMakeLists.txt +++ b/clang/tools/driver/CMakeLists.txt @@ -63,6 +63,7 @@ clang_target_link_libraries(clang clangDriver clangFrontend clangFrontendTool + clangOptions clangSerialization ) diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp index 52cffa4ccbe1f..2aef75597fc5f 100644 --- a/clang/tools/driver/cc1_main.cpp +++ b/clang/tools/driver/cc1_main.cpp @@ -17,7 +17,6 @@ #include "clang/CodeGen/ObjectFilePCHContainerWriter.h" #include "clang/Config/config.h" #include "clang/Driver/DriverDiagnostic.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/CompilerInvocation.h" #include "clang/Frontend/FrontendDiagnostic.h" @@ -25,6 +24,7 @@ #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Frontend/Utils.h" #include "clang/FrontendTool/Utils.h" +#include "clang/Options/Options.h" #include "clang/Serialization/ObjectFilePCHContainerReader.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp index 50da2f8449a22..f13812f2a8383 100644 --- a/clang/tools/driver/cc1as_main.cpp +++ b/clang/tools/driver/cc1as_main.cpp @@ -14,10 +14,10 @@ #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticOptions.h" #include "clang/Driver/DriverDiagnostic.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Frontend/Utils.h" +#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" @@ -59,8 +59,7 @@ #include #include using namespace clang; -using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace llvm; using namespace llvm::opt; @@ -688,8 +687,7 @@ int cc1as_main(ArrayRef Argv, const char *Argv0, void *MainAddr) { getDriverOptTable().printHelp( llvm::outs(), "clang -cc1as [options] file...", "Clang Integrated Assembler", /*ShowHidden=*/false, - /*ShowAllAliases=*/false, - llvm::opt::Visibility(driver::options::CC1AsOption)); + /*ShowAllAliases=*/false, llvm::opt::Visibility(options::CC1AsOption)); return 0; } diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp index 6cf64d70c9399..709bb06ee2152 100644 --- a/clang/tools/driver/driver.cpp +++ b/clang/tools/driver/driver.cpp @@ -18,13 +18,13 @@ #include "clang/Config/config.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/DriverDiagnostic.h" -#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" #include "clang/Frontend/ChainedDiagnosticConsumer.h" #include "clang/Frontend/CompilerInvocation.h" #include "clang/Frontend/SerializedDiagnosticPrinter.h" #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Frontend/Utils.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -350,8 +350,8 @@ int clang_main(int Argc, char **Argv, const llvm::ToolContext &ToolContext) { unsigned NumParallelJobs = getLastArgIntValue(ArgList, options::OPT_parallel_jobs_EQ, 1, Diags); UseNewCC1Process = - ArgList.hasFlag(clang::driver::options::OPT_fno_integrated_cc1, - clang::driver::options::OPT_fintegrated_cc1, + ArgList.hasFlag(options::OPT_fno_integrated_cc1, + options::OPT_fintegrated_cc1, /*Default=*/NumParallelJobs > 1 ? true : CLANG_SPAWN_CC1); if (!DiagOpts->DiagnosticSerializationFile.empty()) { diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp index 62274235c53f5..e0454f190b35a 100644 --- a/clang/unittests/Driver/DXCModeTest.cpp +++ b/clang/unittests/Driver/DXCModeTest.cpp @@ -131,8 +131,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) { TC.TranslateArgs(*DAL, "0", Action::OffloadKind::OFK_None)}; EXPECT_NE(TranslatedArgs, nullptr); if (TranslatedArgs) { - auto *A = TranslatedArgs->getLastArg( - clang::driver::options::OPT_dxil_validator_version); + auto *A = + TranslatedArgs->getLastArg(clang::options::OPT_dxil_validator_version); EXPECT_NE(A, nullptr); if (A) { EXPECT_STREQ(A->getValue(), "1.1"); diff --git a/clang/www/OpenProjects.html b/clang/www/OpenProjects.html index ae0ec1d4d12cb..3e5e84b5b2ed4 100755 --- a/clang/www/OpenProjects.html +++ b/clang/www/OpenProjects.html @@ -38,7 +38,7 @@

Open Clang Projects

  • documenting diagnostic group flags (adding code examples of what is diagnosed, or other relevant information), or
  • -
  • documenting +
  • documenting command line options, or
  • help with completing other missing documentation.
  • diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index ffe9554203241..2d0f4f53d2ac5 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -171,8 +171,7 @@ class Allocator { Primary.Options.set(OptionBit::DeallocTypeMismatch); if (getFlags()->delete_size_mismatch) Primary.Options.set(OptionBit::DeleteSizeMismatch); - if (allocatorSupportsMemoryTagging() && - systemSupportsMemoryTagging()) + if (systemSupportsMemoryTagging()) Primary.Options.set(OptionBit::UseMemoryTagging); QuarantineMaxChunkSize = diff --git a/compiler-rt/lib/scudo/standalone/memtag.h b/compiler-rt/lib/scudo/standalone/memtag.h index 83ebe676433eb..52897b3dca6ae 100644 --- a/compiler-rt/lib/scudo/standalone/memtag.h +++ b/compiler-rt/lib/scudo/standalone/memtag.h @@ -66,7 +66,8 @@ inline bool systemSupportsMemoryTagging() { #ifndef HWCAP2_MTE #define HWCAP2_MTE (1 << 18) #endif - return getauxval(AT_HWCAP2) & HWCAP2_MTE; + static bool SupportsMemoryTagging = getauxval(AT_HWCAP2) & HWCAP2_MTE; + return SupportsMemoryTagging; } inline bool systemDetectsMemoryTagFaultsTestOnly() { @@ -261,9 +262,7 @@ inline uptr loadTag(uptr Ptr) { #else -inline NORETURN bool systemSupportsMemoryTagging() { - UNREACHABLE("memory tagging not supported"); -} +inline bool systemSupportsMemoryTagging() { return false; } inline NORETURN bool systemDetectsMemoryTagFaultsTestOnly() { UNREACHABLE("memory tagging not supported"); diff --git a/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp b/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp index 09093e11452dd..d0d93316f212e 100644 --- a/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp @@ -28,7 +28,6 @@ TEST(MemtagBasicDeathTest, Unsupported) { EXPECT_DEATH(untagPointer((uptr)0), "not supported"); EXPECT_DEATH(extractTag((uptr)0), "not supported"); - EXPECT_DEATH(systemSupportsMemoryTagging(), "not supported"); EXPECT_DEATH(systemDetectsMemoryTagFaultsTestOnly(), "not supported"); EXPECT_DEATH(enableSystemMemoryTaggingTestOnly(), "not supported"); diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp index 855a3e6e6109f..8741c8299b57c 100644 --- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp @@ -27,9 +27,7 @@ const scudo::uptr PageSize = scudo::getPageSizeCached(); template static scudo::Options getOptionsForConfig() { - if (!Config::getMaySupportMemoryTagging() || - !scudo::archSupportsMemoryTagging() || - !scudo::systemSupportsMemoryTagging()) + if (!scudo::systemSupportsMemoryTagging()) return {}; scudo::AtomicOptions AO; AO.set(scudo::OptionBit::UseMemoryTagging); diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp index c802ed22fbad0..84359251a02aa 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp @@ -187,8 +187,7 @@ TEST_F(ScudoWrappersCppTest, ThreadedNew) { // TODO: Investigate why libc sometimes crashes with tag missmatch in // __pthread_clockjoin_ex. std::unique_ptr NoTags; - if (!SCUDO_ANDROID && scudo::archSupportsMemoryTagging() && - scudo::systemSupportsMemoryTagging()) + if (!SCUDO_ANDROID && scudo::systemSupportsMemoryTagging()) NoTags = std::make_unique(); Ready = false; diff --git a/flang/docs/CMakeLists.txt b/flang/docs/CMakeLists.txt index 568f942cb4aa6..b183d6add1059 100644 --- a/flang/docs/CMakeLists.txt +++ b/flang/docs/CMakeLists.txt @@ -88,7 +88,7 @@ function (gen_rst_file_from_td output_file td_option source target) endif() get_filename_component(TABLEGEN_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${source}" DIRECTORY) list(APPEND LLVM_TABLEGEN_FLAGS "-I${TABLEGEN_INCLUDE_DIR}") - list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Driver/") + list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Options/") clang_tablegen(Source/${output_file} ${td_option} SOURCE ${source} TARGET ${target}) endfunction() diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md index 3286171bb1499..9953f2252218b 100644 --- a/flang/docs/FlangDriver.md +++ b/flang/docs/FlangDriver.md @@ -76,7 +76,7 @@ will ignore it when used without `Xflang`. As hinted above, `flang` and `flang -fc1` are two separate tools. The fact that these tools are accessed through one binary, `flang`, is just an implementation detail. Each tool has a separate list of options, albeit defined -in the same file: `clang/include/clang/Driver/Options.td`. +in the same file: `clang/include/clang/Options/Options.td`. The separation helps us split various tasks and allows us to implement more specialised tools. In particular, `flang` is not aware of various @@ -112,7 +112,7 @@ in terms of Clang's driver library, `clangDriver`. This approach allows us to: as linkers and assemblers. One implication of this dependency on Clang is that all of Flang's compiler options are defined alongside Clang's options in -`clang/include/clang/Driver/Options.td`. For options that are common for both +`clang/include/clang/Options/Options.td`. For options that are common for both Flang and Clang, the corresponding definitions are shared. Internally, a `clangDriver` based compiler driver works by creating actions @@ -242,7 +242,7 @@ Adding a new compiler option in Flang consists of two steps: ### Option Definition All of Flang's compiler and frontend driver options are defined in -`clang/include/clang/Driver/Options.td` in Clang. When adding a new option to +`clang/include/clang/Options/Options.td` in Clang. When adding a new option to Flang, you will either: * extend the existing definition for an option that is already available in one of Clang's drivers (e.g. `clang`), but not yet available in Flang, or @@ -314,7 +314,7 @@ add, you will have to add a dedicated entry in that enum (e.g. `ParseSyntaxOnly` for `-fsyntax-only`) and a corresponding `case` in `ParseFrontendArgs` function in the `CompilerInvocation.cpp` file, e.g.: ```cpp - case clang::driver::options::OPT_fsyntax_only: + case clang::options::OPT_fsyntax_only: opts.programAction = ParseSyntaxOnly; break; ``` diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h index 5c56dd6b695f8..6e2442745f9a0 100644 --- a/flang/include/flang/Optimizer/Builder/CUFCommon.h +++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h @@ -18,6 +18,7 @@ static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem"; namespace fir { class FirOpBuilder; +class KindMapping; } // namespace fir namespace cuf { @@ -34,6 +35,10 @@ bool isRegisteredDeviceAttr(std::optional attr); void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder); +int computeElementByteSize(mlir::Location loc, mlir::Type type, + fir::KindMapping &kindMap, + bool emitErrorOnFailure = true); + } // namespace cuf #endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_ diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index bae52d63fda45..289c79bd9b831 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -80,8 +80,7 @@ def AnyRefOfConstantSizeAggregateType : TypeConstraint< // Memory SSA operations //===----------------------------------------------------------------------===// -def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, - MemoryEffects<[MemAlloc]>]> { +def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments]> { let summary = "allocate storage for a temporary on the stack given a type"; let description = [{ This primitive operation is used to allocate an object on the stack. A @@ -162,7 +161,9 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, Variadic:$shape ); - let results = (outs fir_ReferenceType); + let results = + (outs Res]>:$res); let hasCustomAssemblyFormat = 1; let hasVerifier = 1; @@ -212,8 +213,7 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, }]; } -def fir_AllocMemOp : fir_Op<"allocmem", - [MemoryEffects<[MemAlloc]>, AttrSizedOperandSegments]> { +def fir_AllocMemOp : fir_Op<"allocmem", [AttrSizedOperandSegments]> { let summary = "allocate storage on the heap for an object of a given type"; let description = [{ @@ -235,7 +235,7 @@ def fir_AllocMemOp : fir_Op<"allocmem", Variadic:$typeparams, Variadic:$shape ); - let results = (outs fir_HeapType); + let results = (outs Res]>:$res); let hasCustomAssemblyFormat = 1; let hasVerifier = 1; diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt index 2b3bc0e9c2269..bb0b4a39cec9b 100644 --- a/flang/lib/Frontend/CMakeLists.txt +++ b/flang/lib/Frontend/CMakeLists.txt @@ -76,6 +76,7 @@ add_flang_library(flangFrontend CLANG_LIBS clangBasic clangDriver + clangOptions ) target_precompile_headers(flangFrontend PRIVATE diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 5eba5e4cc8a53..afb8c752cedc3 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -26,8 +26,8 @@ #include "clang/Basic/DiagnosticOptions.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/OptionUtils.h" -#include "clang/Driver/Options.h" +#include "clang/Options/OptionUtils.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -50,7 +50,7 @@ #include using namespace Fortran::frontend; - +using namespace clang::options; //===----------------------------------------------------------------------===// // Initialization. //===----------------------------------------------------------------------===// @@ -82,11 +82,11 @@ static bool parseShowColorsArgs(const llvm::opt::ArgList &args, for (auto *a : args) { const llvm::opt::Option &opt = a->getOption(); - if (opt.matches(clang::driver::options::OPT_fcolor_diagnostics)) { + if (opt.matches(clang::options::OPT_fcolor_diagnostics)) { showColors = Colors_On; - } else if (opt.matches(clang::driver::options::OPT_fno_color_diagnostics)) { + } else if (opt.matches(clang::options::OPT_fno_color_diagnostics)) { showColors = Colors_Off; - } else if (opt.matches(clang::driver::options::OPT_fdiagnostics_color_EQ)) { + } else if (opt.matches(clang::options::OPT_fdiagnostics_color_EQ)) { llvm::StringRef value(a->getValue()); if (value == "always") showColors = Colors_On; @@ -107,15 +107,13 @@ static unsigned getOptimizationLevel(llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { unsigned defaultOpt = 0; - if (llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_O_Group)) { - if (a->getOption().matches(clang::driver::options::OPT_O0)) + if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_O_Group)) { + if (a->getOption().matches(clang::options::OPT_O0)) return 0; - assert(a->getOption().matches(clang::driver::options::OPT_O)); + assert(a->getOption().matches(clang::options::OPT_O)); - return getLastArgIntValue(args, clang::driver::options::OPT_O, defaultOpt, - diags); + return getLastArgIntValue(args, clang::options::OPT_O, defaultOpt, diags); } return defaultOpt; @@ -133,7 +131,7 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, clang::DiagnosticsEngine &diags) { using DebugInfoKind = llvm::codegenoptions::DebugInfoKind; if (llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_debug_info_kind_EQ)) { + args.getLastArg(clang::options::OPT_debug_info_kind_EQ)) { std::optional val = llvm::StringSwitch>(arg->getValue()) .Case("line-tables-only", llvm::codegenoptions::DebugLineTablesOnly) @@ -158,13 +156,13 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, diags.Report(debugWarning) << arg->getValue(); } opts.DwarfVersion = - getLastArgIntValue(args, clang::driver::options::OPT_dwarf_version_EQ, + getLastArgIntValue(args, clang::options::OPT_dwarf_version_EQ, /*Default=*/0, diags); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_split_dwarf_file)) + args.getLastArg(clang::options::OPT_split_dwarf_file)) opts.SplitDwarfFile = a->getValue(); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_split_dwarf_output)) + args.getLastArg(clang::options::OPT_split_dwarf_output)) opts.SplitDwarfOutput = a->getValue(); } return true; @@ -174,7 +172,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_fdo_concurrent_to_openmp_EQ); + args.getLastArg(clang::options::OPT_fdo_concurrent_to_openmp_EQ); if (!arg) return; @@ -199,7 +197,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, static bool parseVectorLibArg(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fveclib); + llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fveclib); if (!arg) return true; @@ -237,7 +235,7 @@ parseOptimizationRemark(clang::DiagnosticsEngine &diags, CodeGenOptions::OptRemark result; for (llvm::opt::Arg *a : args) { - if (a->getOption().matches(clang::driver::options::OPT_R_Joined)) { + if (a->getOption().matches(clang::options::OPT_R_Joined)) { llvm::StringRef value = a->getValue(); if (value == remarkOptName) { @@ -274,39 +272,39 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::DiagnosticsEngine &diags) { opts.OptimizationLevel = getOptimizationLevel(args, diags); - if (args.hasFlag(clang::driver::options::OPT_fdebug_pass_manager, - clang::driver::options::OPT_fno_debug_pass_manager, false)) + if (args.hasFlag(clang::options::OPT_fdebug_pass_manager, + clang::options::OPT_fno_debug_pass_manager, false)) opts.DebugPassManager = 1; - if (args.hasFlag(clang::driver::options::OPT_fstack_arrays, - clang::driver::options::OPT_fno_stack_arrays, false)) + if (args.hasFlag(clang::options::OPT_fstack_arrays, + clang::options::OPT_fno_stack_arrays, false)) opts.StackArrays = 1; - if (args.getLastArg(clang::driver::options::OPT_floop_interchange)) + if (args.getLastArg(clang::options::OPT_floop_interchange)) opts.InterchangeLoops = 1; - if (args.getLastArg(clang::driver::options::OPT_fexperimental_loop_fusion)) + if (args.getLastArg(clang::options::OPT_fexperimental_loop_fusion)) opts.FuseLoops = 1; - if (args.getLastArg(clang::driver::options::OPT_vectorize_loops)) + if (args.getLastArg(clang::options::OPT_vectorize_loops)) opts.VectorizeLoop = 1; - if (args.getLastArg(clang::driver::options::OPT_vectorize_slp)) + if (args.getLastArg(clang::options::OPT_vectorize_slp)) opts.VectorizeSLP = 1; - if (args.hasFlag(clang::driver::options::OPT_floop_versioning, - clang::driver::options::OPT_fno_loop_versioning, false)) + if (args.hasFlag(clang::options::OPT_floop_versioning, + clang::options::OPT_fno_loop_versioning, false)) opts.LoopVersioning = 1; - opts.UnrollLoops = args.hasFlag(clang::driver::options::OPT_funroll_loops, - clang::driver::options::OPT_fno_unroll_loops, + opts.UnrollLoops = args.hasFlag(clang::options::OPT_funroll_loops, + clang::options::OPT_fno_unroll_loops, (opts.OptimizationLevel > 1)); opts.AliasAnalysis = opts.OptimizationLevel > 0; // -mframe-pointer=none/non-leaf/reserved/all option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mframe_pointer_EQ)) { + args.getLastArg(clang::options::OPT_mframe_pointer_EQ)) { std::optional val = llvm::StringSwitch>(a->getValue()) .Case("none", llvm::FramePointerKind::None) @@ -322,7 +320,7 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, opts.setFramePointer(val.value()); } - for (auto *a : args.filtered(clang::driver::options::OPT_fpass_plugin_EQ)) + for (auto *a : args.filtered(clang::options::OPT_fpass_plugin_EQ)) opts.LLVMPassPlugins.push_back(a->getValue()); opts.Reciprocals = clang::driver::tools::parseMRecipOption(diags, args); @@ -331,15 +329,14 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::driver::tools::parseMPreferVectorWidthOption(diags, args); // -fembed-offload-object option - for (auto *a : - args.filtered(clang::driver::options::OPT_fembed_offload_object_EQ)) + for (auto *a : args.filtered(clang::options::OPT_fembed_offload_object_EQ)) opts.OffloadObjects.push_back(a->getValue()); - if (args.hasArg(clang::driver::options::OPT_finstrument_functions)) + if (args.hasArg(clang::options::OPT_finstrument_functions)) opts.InstrumentFunctions = 1; - if (const llvm::opt::Arg *a = args.getLastArg( - clang::driver::options::OPT_mcode_object_version_EQ)) { + if (const llvm::opt::Arg *a = + args.getLastArg(clang::options::OPT_mcode_object_version_EQ)) { llvm::StringRef s = a->getValue(); if (s == "6") opts.CodeObjectVersion = llvm::CodeObjectVersionKind::COV_6; @@ -353,36 +350,36 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, // -f[no-]save-optimization-record[=] if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_file)) + args.getLastArg(clang::options::OPT_opt_record_file)) opts.OptRecordFile = a->getValue(); // Optimization file format. Defaults to yaml if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_format)) + args.getLastArg(clang::options::OPT_opt_record_format)) opts.OptRecordFormat = a->getValue(); // Specifies, using a regex, which successful optimization passes(middle and // backend), to include in the final optimization record file generated. If // not provided -fsave-optimization-record will include all passes. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_passes)) + args.getLastArg(clang::options::OPT_opt_record_passes)) opts.OptRecordPasses = a->getValue(); // Create OptRemark that allows printing of all successful optimization // passes applied. opts.OptimizationRemark = - parseOptimizationRemark(diags, args, clang::driver::options::OPT_Rpass_EQ, + parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_EQ, /*remarkOptName=*/"pass"); // Create OptRemark that allows all missed optimization passes to be printed. - opts.OptimizationRemarkMissed = parseOptimizationRemark( - diags, args, clang::driver::options::OPT_Rpass_missed_EQ, - /*remarkOptName=*/"pass-missed"); + opts.OptimizationRemarkMissed = + parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_missed_EQ, + /*remarkOptName=*/"pass-missed"); // Create OptRemark that allows all optimization decisions made by LLVM // to be printed. opts.OptimizationRemarkAnalysis = parseOptimizationRemark( - diags, args, clang::driver::options::OPT_Rpass_analysis_EQ, + diags, args, clang::options::OPT_Rpass_analysis_EQ, /*remarkOptName=*/"pass-analysis"); if (opts.getDebugInfo() == llvm::codegenoptions::NoDebugInfo) { @@ -400,23 +397,22 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, opts.setDebugInfo(llvm::codegenoptions::LocTrackingOnly); } - if (auto *a = args.getLastArg(clang::driver::options::OPT_save_temps_EQ)) + if (auto *a = args.getLastArg(clang::options::OPT_save_temps_EQ)) opts.SaveTempsDir = a->getValue(); // -record-command-line option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_record_command_line)) { + args.getLastArg(clang::options::OPT_record_command_line)) { opts.RecordCommandLine = a->getValue(); } // -mlink-builtin-bitcode - for (auto *a : - args.filtered(clang::driver::options::OPT_mlink_builtin_bitcode)) + for (auto *a : args.filtered(clang::options::OPT_mlink_builtin_bitcode)) opts.BuiltinBCLibs.push_back(a->getValue()); // -mrelocation-model option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mrelocation_model)) { + args.getLastArg(clang::options::OPT_mrelocation_model)) { llvm::StringRef modelName = a->getValue(); auto relocModel = llvm::StringSwitch>(modelName) @@ -435,31 +431,30 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, } // -pic-level and -pic-is-pie option. - if (int picLevel = getLastArgIntValue( - args, clang::driver::options::OPT_pic_level, 0, diags)) { + if (int picLevel = + getLastArgIntValue(args, clang::options::OPT_pic_level, 0, diags)) { if (picLevel > 2) diags.Report(clang::diag::err_drv_invalid_value) - << args.getLastArg(clang::driver::options::OPT_pic_level) - ->getAsString(args) + << args.getLastArg(clang::options::OPT_pic_level)->getAsString(args) << picLevel; opts.PICLevel = picLevel; - if (args.hasArg(clang::driver::options::OPT_pic_is_pie)) + if (args.hasArg(clang::options::OPT_pic_is_pie)) opts.IsPIE = 1; } - if (args.hasArg(clang::driver::options::OPT_fprofile_generate)) { + if (args.hasArg(clang::options::OPT_fprofile_generate)) { opts.setProfileInstr(llvm::driver::ProfileInstrKind::ProfileIRInstr); } - if (auto A = args.getLastArg(clang::driver::options::OPT_fprofile_use_EQ)) { + if (auto A = args.getLastArg(clang::options::OPT_fprofile_use_EQ)) { opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr); opts.ProfileInstrumentUsePath = A->getValue(); } // -mcmodel option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mcmodel_EQ)) { + args.getLastArg(clang::options::OPT_mcmodel_EQ)) { llvm::StringRef modelName = a->getValue(); std::optional codeModel = getCodeModel(modelName); @@ -470,8 +465,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, << a->getAsString(args) << modelName; } - if (const llvm::opt::Arg *arg = args.getLastArg( - clang::driver::options::OPT_mlarge_data_threshold_EQ)) { + if (const llvm::opt::Arg *arg = + args.getLastArg(clang::options::OPT_mlarge_data_threshold_EQ)) { uint64_t LDT; if (llvm::StringRef(arg->getValue()).getAsInteger(/*Radix=*/10, LDT)) { diags.Report(clang::diag::err_drv_invalid_value) @@ -481,13 +476,13 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, } // This option is compatible with -f[no-]underscoring in gfortran. - if (args.hasFlag(clang::driver::options::OPT_fno_underscoring, - clang::driver::options::OPT_funderscoring, false)) { + if (args.hasFlag(clang::options::OPT_fno_underscoring, + clang::options::OPT_funderscoring, false)) { opts.Underscoring = 0; } - if (args.hasFlag(clang::driver::options::OPT_foffload_global_filtering, - clang::driver::options::OPT_fno_offload_global_filtering, + if (args.hasFlag(clang::options::OPT_foffload_global_filtering, + clang::options::OPT_fno_offload_global_filtering, false)) { opts.OffloadGlobalFiltering = 1; } @@ -495,11 +490,11 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, parseDoConcurrentMapping(opts, args, diags); opts.DeferDescriptorMapping = - args.hasFlag(clang::driver::options::OPT_fdefer_desc_map, - clang::driver::options::OPT_fno_defer_desc_map, true); + args.hasFlag(clang::options::OPT_fdefer_desc_map, + clang::options::OPT_fno_defer_desc_map, true); if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_complex_range_EQ)) { + args.getLastArg(clang::options::OPT_complex_range_EQ)) { llvm::StringRef argValue = llvm::StringRef(arg->getValue()); if (argValue == "full") { opts.setComplexRange(CodeGenOptions::ComplexRangeKind::CX_Full); @@ -520,46 +515,42 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, /// \param [in] opts The target options instance to update /// \param [in] args The list of input arguments (from the compiler invocation) static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_triple)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_triple)) opts.triple = a->getValue(); - opts.atomicIgnoreDenormalMode = args.hasFlag( - clang::driver::options::OPT_fatomic_ignore_denormal_mode, - clang::driver::options::OPT_fno_atomic_ignore_denormal_mode, false); - opts.atomicFineGrainedMemory = args.hasFlag( - clang::driver::options::OPT_fatomic_fine_grained_memory, - clang::driver::options::OPT_fno_atomic_fine_grained_memory, false); + opts.atomicIgnoreDenormalMode = + args.hasFlag(clang::options::OPT_fatomic_ignore_denormal_mode, + clang::options::OPT_fno_atomic_ignore_denormal_mode, false); + opts.atomicFineGrainedMemory = + args.hasFlag(clang::options::OPT_fatomic_fine_grained_memory, + clang::options::OPT_fno_atomic_fine_grained_memory, false); opts.atomicRemoteMemory = - args.hasFlag(clang::driver::options::OPT_fatomic_remote_memory, - clang::driver::options::OPT_fno_atomic_remote_memory, false); + args.hasFlag(clang::options::OPT_fatomic_remote_memory, + clang::options::OPT_fno_atomic_remote_memory, false); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_target_cpu)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_target_cpu)) opts.cpu = a->getValue(); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_tune_cpu)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_tune_cpu)) opts.cpuToTuneFor = a->getValue(); for (const llvm::opt::Arg *currentArg : - args.filtered(clang::driver::options::OPT_target_feature)) + args.filtered(clang::options::OPT_target_feature)) opts.featuresAsWritten.emplace_back(currentArg->getValue()); - if (args.hasArg(clang::driver::options::OPT_fdisable_real_10)) + if (args.hasArg(clang::options::OPT_fdisable_real_10)) opts.disabledRealKinds.push_back(10); - if (args.hasArg(clang::driver::options::OPT_fdisable_real_3)) + if (args.hasArg(clang::options::OPT_fdisable_real_3)) opts.disabledRealKinds.push_back(3); - if (args.hasArg(clang::driver::options::OPT_fdisable_integer_2)) + if (args.hasArg(clang::options::OPT_fdisable_integer_2)) opts.disabledIntegerKinds.push_back(2); - if (args.hasArg(clang::driver::options::OPT_fdisable_integer_16)) + if (args.hasArg(clang::options::OPT_fdisable_integer_16)) opts.disabledIntegerKinds.push_back(16); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mabi_EQ)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_mabi_EQ)) { opts.abi = a->getValue(); llvm::StringRef V = a->getValue(); if (V == "vec-extabi") { @@ -569,9 +560,8 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { } } - opts.asmVerbose = - args.hasFlag(clang::driver::options::OPT_fverbose_asm, - clang::driver::options::OPT_fno_verbose_asm, false); + opts.asmVerbose = args.hasFlag(clang::options::OPT_fverbose_asm, + clang::options::OPT_fno_verbose_asm, false); } // Tweak the frontend configuration based on the frontend action static void setUpFrontendBasedOnAction(FrontendOptions &opts) { @@ -604,11 +594,11 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Treat multiple action options as an invocation error. Note that `clang // -cc1` does accept multiple action options, but will only consider the // rightmost one. - if (args.hasMultipleArgs(clang::driver::options::OPT_Action_Group)) { + if (args.hasMultipleArgs(clang::options::OPT_Action_Group)) { llvm::SmallString<32> buf; llvm::raw_svector_ostream os(buf); for (const llvm::opt::Arg *arg : - args.filtered(clang::driver::options::OPT_Action_Group)) { + args.filtered(clang::options::OPT_Action_Group)) { if (buf.size()) os << ", "; os << "'" << arg->getSpelling() << "'"; @@ -619,99 +609,99 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Identify the action (i.e. opts.ProgramAction) if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_Action_Group)) { + args.getLastArg(clang::options::OPT_Action_Group)) { switch (a->getOption().getID()) { default: { llvm_unreachable("Invalid option in group!"); } - case clang::driver::options::OPT_test_io: + case clang::options::OPT_test_io: opts.programAction = InputOutputTest; break; - case clang::driver::options::OPT_E: + case clang::options::OPT_E: opts.programAction = PrintPreprocessedInput; break; - case clang::driver::options::OPT_fsyntax_only: + case clang::options::OPT_fsyntax_only: opts.programAction = ParseSyntaxOnly; break; - case clang::driver::options::OPT_emit_fir: + case clang::options::OPT_emit_fir: opts.programAction = EmitFIR; break; - case clang::driver::options::OPT_emit_hlfir: + case clang::options::OPT_emit_hlfir: opts.programAction = EmitHLFIR; break; - case clang::driver::options::OPT_emit_llvm: + case clang::options::OPT_emit_llvm: opts.programAction = EmitLLVM; break; - case clang::driver::options::OPT_emit_llvm_bc: + case clang::options::OPT_emit_llvm_bc: opts.programAction = EmitLLVMBitcode; break; - case clang::driver::options::OPT_emit_obj: + case clang::options::OPT_emit_obj: opts.programAction = EmitObj; break; - case clang::driver::options::OPT_S: + case clang::options::OPT_S: opts.programAction = EmitAssembly; break; - case clang::driver::options::OPT_fdebug_unparse: + case clang::options::OPT_fdebug_unparse: opts.programAction = DebugUnparse; break; - case clang::driver::options::OPT_fdebug_unparse_no_sema: + case clang::options::OPT_fdebug_unparse_no_sema: opts.programAction = DebugUnparseNoSema; break; - case clang::driver::options::OPT_fdebug_unparse_with_symbols: + case clang::options::OPT_fdebug_unparse_with_symbols: opts.programAction = DebugUnparseWithSymbols; break; - case clang::driver::options::OPT_fdebug_unparse_with_modules: + case clang::options::OPT_fdebug_unparse_with_modules: opts.programAction = DebugUnparseWithModules; break; - case clang::driver::options::OPT_fdebug_dump_symbols: + case clang::options::OPT_fdebug_dump_symbols: opts.programAction = DebugDumpSymbols; break; - case clang::driver::options::OPT_fdebug_dump_parse_tree: + case clang::options::OPT_fdebug_dump_parse_tree: opts.programAction = DebugDumpParseTree; break; - case clang::driver::options::OPT_fdebug_dump_pft: + case clang::options::OPT_fdebug_dump_pft: opts.programAction = DebugDumpPFT; break; - case clang::driver::options::OPT_fdebug_dump_all: + case clang::options::OPT_fdebug_dump_all: opts.programAction = DebugDumpAll; break; - case clang::driver::options::OPT_fdebug_dump_parse_tree_no_sema: + case clang::options::OPT_fdebug_dump_parse_tree_no_sema: opts.programAction = DebugDumpParseTreeNoSema; break; - case clang::driver::options::OPT_fdebug_dump_provenance: + case clang::options::OPT_fdebug_dump_provenance: opts.programAction = DebugDumpProvenance; break; - case clang::driver::options::OPT_fdebug_dump_parsing_log: + case clang::options::OPT_fdebug_dump_parsing_log: opts.programAction = DebugDumpParsingLog; break; - case clang::driver::options::OPT_fdebug_measure_parse_tree: + case clang::options::OPT_fdebug_measure_parse_tree: opts.programAction = DebugMeasureParseTree; break; - case clang::driver::options::OPT_fdebug_pre_fir_tree: + case clang::options::OPT_fdebug_pre_fir_tree: opts.programAction = DebugPreFIRTree; break; - case clang::driver::options::OPT_fget_symbols_sources: + case clang::options::OPT_fget_symbols_sources: opts.programAction = GetSymbolsSources; break; - case clang::driver::options::OPT_fget_definition: + case clang::options::OPT_fget_definition: opts.programAction = GetDefinition; break; - case clang::driver::options::OPT_init_only: + case clang::options::OPT_init_only: opts.programAction = InitOnly; break; // TODO: - // case clang::driver::options::OPT_emit_llvm: - // case clang::driver::options::OPT_emit_llvm_only: - // case clang::driver::options::OPT_emit_codegen_only: - // case clang::driver::options::OPT_emit_module: + // case clang::options::OPT_emit_llvm: + // case clang::options::OPT_emit_llvm_only: + // case clang::options::OPT_emit_codegen_only: + // case clang::options::OPT_emit_module: // (...) } // Parse the values provided with `-fget-definition` (there should be 3 // integers) if (llvm::opt::OptSpecifier(a->getOption().getID()) == - clang::driver::options::OPT_fget_definition) { + clang::options::OPT_fget_definition) { unsigned optVals[3] = {0, 0, 0}; for (unsigned i = 0; i < 3; i++) { @@ -731,27 +721,25 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Parsing -load option and storing shared object path - if (llvm::opt::Arg *a = args.getLastArg(clang::driver::options::OPT_load)) { + if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_load)) { opts.plugins.push_back(a->getValue()); } // Parsing -plugin option and storing plugin name and setting action - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_plugin)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_plugin)) { opts.programAction = PluginAction; opts.actionName = a->getValue(); } - opts.outputFile = args.getLastArgValue(clang::driver::options::OPT_o); - opts.showHelp = args.hasArg(clang::driver::options::OPT_help); - opts.showVersion = args.hasArg(clang::driver::options::OPT_version); + opts.outputFile = args.getLastArgValue(clang::options::OPT_o); + opts.showHelp = args.hasArg(clang::options::OPT_help); + opts.showVersion = args.hasArg(clang::options::OPT_version); opts.printSupportedCPUs = - args.hasArg(clang::driver::options::OPT_print_supported_cpus); + args.hasArg(clang::options::OPT_print_supported_cpus); // Get the input kind (from the value passed via `-x`) InputKind dashX(Language::Unknown); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_x)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_x)) { llvm::StringRef xValue = a->getValue(); // Principal languages. dashX = llvm::StringSwitch(xValue) @@ -778,7 +766,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Collect the input files and save them in our instance of FrontendOptions. std::vector inputs = - args.getAllArgValues(clang::driver::options::OPT_INPUT); + args.getAllArgValues(clang::options::OPT_INPUT); opts.inputs.clear(); if (inputs.empty()) // '-' is the default input if none is given. @@ -798,18 +786,16 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Set fortranForm based on options -ffree-form and -ffixed-form. - if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_ffixed_form, - clang::driver::options::OPT_ffree_form)) { - opts.fortranForm = - arg->getOption().matches(clang::driver::options::OPT_ffixed_form) - ? FortranForm::FixedForm - : FortranForm::FreeForm; + if (const auto *arg = args.getLastArg(clang::options::OPT_ffixed_form, + clang::options::OPT_ffree_form)) { + opts.fortranForm = arg->getOption().matches(clang::options::OPT_ffixed_form) + ? FortranForm::FixedForm + : FortranForm::FreeForm; } // Set fixedFormColumns based on -ffixed-line-length= if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_ffixed_line_length_EQ)) { + args.getLastArg(clang::options::OPT_ffixed_line_length_EQ)) { llvm::StringRef argValue = llvm::StringRef(arg->getValue()); std::int64_t columns = -1; if (argValue == "none") { @@ -831,8 +817,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Set conversion based on -fconvert= - if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_fconvert_EQ)) { + if (const auto *arg = args.getLastArg(clang::options::OPT_fconvert_EQ)) { const char *argValue = arg->getValue(); if (auto convert = parseConvertArg(argValue)) opts.envDefaults.push_back({"FORT_CONVERT", *convert}); @@ -842,65 +827,61 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // -f{no-}implicit-none - opts.features.Enable( - Fortran::common::LanguageFeature::ImplicitNoneTypeAlways, - args.hasFlag(clang::driver::options::OPT_fimplicit_none, - clang::driver::options::OPT_fno_implicit_none, false)); + opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneTypeAlways, + args.hasFlag(clang::options::OPT_fimplicit_none, + clang::options::OPT_fno_implicit_none, + false)); // -f{no-}implicit-none-ext - opts.features.Enable( - Fortran::common::LanguageFeature::ImplicitNoneExternal, - args.hasFlag(clang::driver::options::OPT_fimplicit_none_ext, - clang::driver::options::OPT_fno_implicit_none_ext, false)); + opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneExternal, + args.hasFlag(clang::options::OPT_fimplicit_none_ext, + clang::options::OPT_fno_implicit_none_ext, + false)); // -f{no-}backslash opts.features.Enable(Fortran::common::LanguageFeature::BackslashEscapes, - args.hasFlag(clang::driver::options::OPT_fbackslash, - clang::driver::options::OPT_fno_backslash, - false)); + args.hasFlag(clang::options::OPT_fbackslash, + clang::options::OPT_fno_backslash, false)); // -f{no-}logical-abbreviations opts.features.Enable( Fortran::common::LanguageFeature::LogicalAbbreviations, - args.hasFlag(clang::driver::options::OPT_flogical_abbreviations, - clang::driver::options::OPT_fno_logical_abbreviations, - false)); + args.hasFlag(clang::options::OPT_flogical_abbreviations, + clang::options::OPT_fno_logical_abbreviations, false)); // -f{no-}unsigned opts.features.Enable(Fortran::common::LanguageFeature::Unsigned, - args.hasFlag(clang::driver::options::OPT_funsigned, - clang::driver::options::OPT_fno_unsigned, - false)); + args.hasFlag(clang::options::OPT_funsigned, + clang::options::OPT_fno_unsigned, false)); // -f{no-}xor-operator - opts.features.Enable( - Fortran::common::LanguageFeature::XOROperator, - args.hasFlag(clang::driver::options::OPT_fxor_operator, - clang::driver::options::OPT_fno_xor_operator, false)); + opts.features.Enable(Fortran::common::LanguageFeature::XOROperator, + args.hasFlag(clang::options::OPT_fxor_operator, + clang::options::OPT_fno_xor_operator, + false)); // -fno-automatic - if (args.hasArg(clang::driver::options::OPT_fno_automatic)) { + if (args.hasArg(clang::options::OPT_fno_automatic)) { opts.features.Enable(Fortran::common::LanguageFeature::DefaultSave); } // -f{no}-save-main-program - opts.features.Enable( - Fortran::common::LanguageFeature::SaveMainProgram, - args.hasFlag(clang::driver::options::OPT_fsave_main_program, - clang::driver::options::OPT_fno_save_main_program, false)); + opts.features.Enable(Fortran::common::LanguageFeature::SaveMainProgram, + args.hasFlag(clang::options::OPT_fsave_main_program, + clang::options::OPT_fno_save_main_program, + false)); // -ffast-amd-memory-allocator - if (args.hasArg(clang::driver::options::OPT_ffast_amd_memory_allocator)) { + if (args.hasArg(clang::options::OPT_ffast_amd_memory_allocator)) { opts.features.Enable( (Fortran::common::LanguageFeature::AmdMemoryAllocator)); } - if (args.hasArg( - clang::driver::options::OPT_falternative_parameter_statement)) { + if (args.hasArg(clang::options::OPT_falternative_parameter_statement)) { opts.features.Enable(Fortran::common::LanguageFeature::OldStyleParameter); } if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_finput_charset_EQ)) { + args.getLastArg(clang::options::OPT_finput_charset_EQ)) { llvm::StringRef argValue = arg->getValue(); if (argValue == "utf-8") { opts.encoding = Fortran::parser::Encoding::UTF_8; @@ -945,9 +926,9 @@ static std::string getOpenMPHeadersDir(const char *argv) { static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts, llvm::opt::ArgList &args) { // Add macros from the command line. - for (const auto *currentArg : args.filtered(clang::driver::options::OPT_D, - clang::driver::options::OPT_U)) { - if (currentArg->getOption().matches(clang::driver::options::OPT_D)) { + for (const auto *currentArg : + args.filtered(clang::options::OPT_D, clang::options::OPT_U)) { + if (currentArg->getOption().matches(clang::options::OPT_D)) { opts.addMacroDef(currentArg->getValue()); } else { opts.addMacroUndef(currentArg->getValue()); @@ -955,34 +936,33 @@ static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts, } // Add the ordered list of -I's. - for (const auto *currentArg : args.filtered(clang::driver::options::OPT_I)) + for (const auto *currentArg : args.filtered(clang::options::OPT_I)) opts.searchDirectoriesFromDashI.emplace_back(currentArg->getValue()); // Prepend the ordered list of -intrinsic-modules-path // to the default location to search. for (const auto *currentArg : - args.filtered(clang::driver::options::OPT_fintrinsic_modules_path)) + args.filtered(clang::options::OPT_fintrinsic_modules_path)) opts.searchDirectoriesFromIntrModPath.emplace_back(currentArg->getValue()); // -cpp/-nocpp - if (const auto *currentArg = args.getLastArg( - clang::driver::options::OPT_cpp, clang::driver::options::OPT_nocpp)) - opts.macrosFlag = - (currentArg->getOption().matches(clang::driver::options::OPT_cpp)) - ? PPMacrosFlag::Include - : PPMacrosFlag::Exclude; + if (const auto *currentArg = + args.getLastArg(clang::options::OPT_cpp, clang::options::OPT_nocpp)) + opts.macrosFlag = (currentArg->getOption().matches(clang::options::OPT_cpp)) + ? PPMacrosFlag::Include + : PPMacrosFlag::Exclude; // Enable -cpp based on -x unless explicitly disabled with -nocpp if (opts.macrosFlag != PPMacrosFlag::Exclude) - if (const auto *dashX = args.getLastArg(clang::driver::options::OPT_x)) + if (const auto *dashX = args.getLastArg(clang::options::OPT_x)) opts.macrosFlag = llvm::StringSwitch(dashX->getValue()) .Case("f95-cpp-input", PPMacrosFlag::Include) .Default(opts.macrosFlag); - opts.noReformat = args.hasArg(clang::driver::options::OPT_fno_reformat); + opts.noReformat = args.hasArg(clang::options::OPT_fno_reformat); opts.preprocessIncludeLines = - args.hasArg(clang::driver::options::OPT_fpreprocess_include_lines); - opts.noLineDirectives = args.hasArg(clang::driver::options::OPT_P); - opts.showMacros = args.hasArg(clang::driver::options::OPT_dM); + args.hasArg(clang::options::OPT_fpreprocess_include_lines); + opts.noLineDirectives = args.hasArg(clang::options::OPT_P); + opts.showMacros = args.hasArg(clang::options::OPT_dM); } /// Parses all semantic related arguments and populates the variables @@ -993,7 +973,7 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -J/module-dir option std::vector moduleDirList = - args.getAllArgValues(clang::driver::options::OPT_module_dir); + args.getAllArgValues(clang::options::OPT_module_dir); // User can only specify one -J/-module-dir directory, but may repeat // -J/-module-dir as long as the directory is the same each time. // https://gcc.gnu.org/onlinedocs/gfortran/Directory-Options.html @@ -1012,25 +992,25 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args, res.setModuleDir(moduleDirList[0]); // -fdebug-module-writer option - if (args.hasArg(clang::driver::options::OPT_fdebug_module_writer)) { + if (args.hasArg(clang::options::OPT_fdebug_module_writer)) { res.setDebugModuleDir(true); } // -fhermetic-module-files option - if (args.hasArg(clang::driver::options::OPT_fhermetic_module_files)) { + if (args.hasArg(clang::options::OPT_fhermetic_module_files)) { res.setHermeticModuleFileOutput(true); } // -module-suffix if (const auto *moduleSuffix = - args.getLastArg(clang::driver::options::OPT_module_suffix)) { + args.getLastArg(clang::options::OPT_module_suffix)) { res.setModuleFileSuffix(moduleSuffix->getValue()); } // -f{no-}analyzed-objects-for-unparse - res.setUseAnalyzedObjectsForUnparse(args.hasFlag( - clang::driver::options::OPT_fanalyzed_objects_for_unparse, - clang::driver::options::OPT_fno_analyzed_objects_for_unparse, true)); + res.setUseAnalyzedObjectsForUnparse( + args.hasFlag(clang::options::OPT_fanalyzed_objects_for_unparse, + clang::options::OPT_fno_analyzed_objects_for_unparse, true)); return diags.getNumErrors() == numErrorsBefore; } @@ -1047,7 +1027,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // chosen to match clang's behavior. // -pedantic - if (args.hasArg(clang::driver::options::OPT_pedantic)) { + if (args.hasArg(clang::options::OPT_pedantic)) { features.WarnOnAllNonstandard(); features.WarnOnAllUsage(); res.setEnableConformanceChecks(); @@ -1057,9 +1037,8 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -Werror option // TODO: Currently throws a Diagnostic for anything other than -W, // this has to change when other -W's are supported. - if (args.hasArg(clang::driver::options::OPT_W_Joined)) { - const auto &wArgs = - args.getAllArgValues(clang::driver::options::OPT_W_Joined); + if (args.hasArg(clang::options::OPT_W_Joined)) { + const auto &wArgs = args.getAllArgValues(clang::options::OPT_W_Joined); for (const auto &wArg : wArgs) { if (wArg == "error") { res.setWarnAsErr(true); @@ -1076,7 +1055,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -w - if (args.hasArg(clang::driver::options::OPT_w)) { + if (args.hasArg(clang::options::OPT_w)) { features.DisableAllWarnings(); res.setDisableWarnings(); } @@ -1096,7 +1075,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, unsigned numErrorsBefore = diags.getNumErrors(); // -fd-lines-as-code - if (args.hasArg(clang::driver::options::OPT_fd_lines_as_code)) { + if (args.hasArg(clang::options::OPT_fd_lines_as_code)) { if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) { const unsigned fdLinesAsWarning = diags.getCustomDiagID( clang::DiagnosticsEngine::Warning, @@ -1109,7 +1088,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -fd-lines-as-comments - if (args.hasArg(clang::driver::options::OPT_fd_lines_as_comments)) { + if (args.hasArg(clang::options::OPT_fd_lines_as_comments)) { if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) { const unsigned fdLinesAsWarning = diags.getCustomDiagID( clang::DiagnosticsEngine::Warning, @@ -1122,18 +1101,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -fdefault* family - if (args.hasArg(clang::driver::options::OPT_fdefault_real_8)) { + if (args.hasArg(clang::options::OPT_fdefault_real_8)) { res.getDefaultKinds().set_defaultRealKind(8); res.getDefaultKinds().set_doublePrecisionKind(16); } - if (args.hasArg(clang::driver::options::OPT_fdefault_integer_8)) { + if (args.hasArg(clang::options::OPT_fdefault_integer_8)) { res.getDefaultKinds().set_defaultIntegerKind(8); res.getDefaultKinds().set_subscriptIntegerKind(8); res.getDefaultKinds().set_sizeIntegerKind(8); res.getDefaultKinds().set_defaultLogicalKind(8); } - if (args.hasArg(clang::driver::options::OPT_fdefault_double_8)) { - if (!args.hasArg(clang::driver::options::OPT_fdefault_real_8)) { + if (args.hasArg(clang::options::OPT_fdefault_double_8)) { + if (!args.hasArg(clang::options::OPT_fdefault_real_8)) { // -fdefault-double-8 has to be used with -fdefault-real-8 // to be compatible with gfortran const unsigned diagID = diags.getCustomDiagID( @@ -1144,18 +1123,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // https://gcc.gnu.org/onlinedocs/gfortran/Fortran-Dialect-Options.html res.getDefaultKinds().set_doublePrecisionKind(8); } - if (args.hasArg(clang::driver::options::OPT_flarge_sizes)) + if (args.hasArg(clang::options::OPT_flarge_sizes)) res.getDefaultKinds().set_sizeIntegerKind(8); // -x cuda - auto language = args.getLastArgValue(clang::driver::options::OPT_x); + auto language = args.getLastArgValue(clang::options::OPT_x); if (language == "cuda") { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::CUDA); } // -fopenacc - if (args.hasArg(clang::driver::options::OPT_fopenacc)) { + if (args.hasArg(clang::options::OPT_fopenacc)) { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::OpenACC); } @@ -1163,8 +1142,8 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -std=f2018 // TODO: Set proper options when more fortran standards // are supported. - if (args.hasArg(clang::driver::options::OPT_std_EQ)) { - auto standard = args.getLastArgValue(clang::driver::options::OPT_std_EQ); + if (args.hasArg(clang::options::OPT_std_EQ)) { + auto standard = args.getLastArgValue(clang::options::OPT_std_EQ); // We only allow f2018 as the given standard if (standard == "f2018") { res.setEnableConformanceChecks(); @@ -1177,7 +1156,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } } // -fcoarray - if (args.hasArg(clang::driver::options::OPT_fcoarray)) { + if (args.hasArg(clang::options::OPT_fcoarray)) { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::Coarray); const unsigned diagID = @@ -1195,13 +1174,12 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, /// generated. static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp, - clang::driver::options::OPT_fno_openmp); - if (!arg || - arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) { - bool isSimdSpecified = args.hasFlag( - clang::driver::options::OPT_fopenmp_simd, - clang::driver::options::OPT_fno_openmp_simd, /*Default=*/false); + llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fopenmp, + clang::options::OPT_fno_openmp); + if (!arg || arg->getOption().matches(clang::options::OPT_fno_openmp)) { + bool isSimdSpecified = + args.hasFlag(clang::options::OPT_fopenmp_simd, + clang::options::OPT_fno_openmp_simd, /*Default=*/false); if (!isSimdSpecified) return true; res.getLangOpts().OpenMPSimd = 1; @@ -1217,8 +1195,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, res.getLangOpts().OpenMPVersion = newestFullySupported; res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::OpenMP); - if (auto *arg = - args.getLastArg(clang::driver::options::OPT_fopenmp_version_EQ)) { + if (auto *arg = args.getLastArg(clang::options::OPT_fopenmp_version_EQ)) { llvm::ArrayRef ompVersions = llvm::omp::getOpenMPVersions(); unsigned oldVersions[] = {11, 20, 25, 30}; unsigned version = 0; @@ -1273,16 +1250,16 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } } - if (args.hasArg(clang::driver::options::OPT_fopenmp_force_usm)) { + if (args.hasArg(clang::options::OPT_fopenmp_force_usm)) { res.getLangOpts().OpenMPForceUSM = 1; } - if (args.hasArg(clang::driver::options::OPT_fopenmp_is_target_device)) { + if (args.hasArg(clang::options::OPT_fopenmp_is_target_device)) { res.getLangOpts().OpenMPIsTargetDevice = 1; // Get OpenMP host file path if any and report if a non existent file is // found - if (auto *arg = args.getLastArg( - clang::driver::options::OPT_fopenmp_host_ir_file_path)) { + if (auto *arg = + args.getLastArg(clang::options::OPT_fopenmp_host_ir_file_path)) { res.getLangOpts().OMPHostIRFile = arg->getValue(); if (!llvm::sys::fs::exists(res.getLangOpts().OMPHostIRFile)) diags.Report(clang::diag::err_omp_host_ir_file_not_found) @@ -1290,37 +1267,34 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } if (args.hasFlag( - clang::driver::options::OPT_fopenmp_assume_teams_oversubscription, - clang::driver::options:: - OPT_fno_openmp_assume_teams_oversubscription, + clang::options::OPT_fopenmp_assume_teams_oversubscription, + clang::options::OPT_fno_openmp_assume_teams_oversubscription, /*Default=*/false)) res.getLangOpts().OpenMPTeamSubscription = true; - if (args.hasArg(clang::driver::options::OPT_fopenmp_assume_no_thread_state)) + if (args.hasArg(clang::options::OPT_fopenmp_assume_no_thread_state)) res.getLangOpts().OpenMPNoThreadState = 1; - if (args.hasArg( - clang::driver::options::OPT_fopenmp_assume_no_nested_parallelism)) + if (args.hasArg(clang::options::OPT_fopenmp_assume_no_nested_parallelism)) res.getLangOpts().OpenMPNoNestedParallelism = 1; if (args.hasFlag( - clang::driver::options::OPT_fopenmp_assume_threads_oversubscription, - clang::driver::options:: - OPT_fno_openmp_assume_threads_oversubscription, + clang::options::OPT_fopenmp_assume_threads_oversubscription, + clang::options::OPT_fno_openmp_assume_threads_oversubscription, /*Default=*/false)) res.getLangOpts().OpenMPThreadSubscription = true; - if ((args.hasArg(clang::driver::options::OPT_fopenmp_target_debug) || - args.hasArg(clang::driver::options::OPT_fopenmp_target_debug_EQ))) { - res.getLangOpts().OpenMPTargetDebug = getLastArgIntValue( - args, clang::driver::options::OPT_fopenmp_target_debug_EQ, - res.getLangOpts().OpenMPTargetDebug, diags); + if ((args.hasArg(clang::options::OPT_fopenmp_target_debug) || + args.hasArg(clang::options::OPT_fopenmp_target_debug_EQ))) { + res.getLangOpts().OpenMPTargetDebug = + getLastArgIntValue(args, clang::options::OPT_fopenmp_target_debug_EQ, + res.getLangOpts().OpenMPTargetDebug, diags); if (!res.getLangOpts().OpenMPTargetDebug && - args.hasArg(clang::driver::options::OPT_fopenmp_target_debug)) + args.hasArg(clang::options::OPT_fopenmp_target_debug)) res.getLangOpts().OpenMPTargetDebug = 1; } - if (args.hasArg(clang::driver::options::OPT_no_offloadlib)) + if (args.hasArg(clang::options::OPT_no_offloadlib)) res.getLangOpts().NoGPULib = 1; } if (llvm::Triple(res.getTargetOpts().triple).isGPU()) { @@ -1336,8 +1310,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // Get the OpenMP target triples if any. - if (auto *arg = - args.getLastArg(clang::driver::options::OPT_offload_targets_EQ)) { + if (auto *arg = args.getLastArg(clang::options::OPT_offload_targets_EQ)) { enum ArchPtrSize { Arch16Bit, Arch32Bit, Arch64Bit }; auto getArchPtrSize = [](const llvm::Triple &triple) { if (triple.isArch16Bit()) @@ -1367,7 +1340,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } if (args.hasArg( - clang::driver::options::OPT_famd_allow_threadprivate_equivalence)) + clang::options::OPT_famd_allow_threadprivate_equivalence)) res.getLangOpts().AllowThreadprivateEquivalence = true; return diags.getNumErrors() == numErrorsBefore; @@ -1385,7 +1358,7 @@ static bool parseIntegerOverflowArgs(CompilerInvocation &invoc, clang::DiagnosticsEngine &diags) { Fortran::common::LangOptions &opts = invoc.getLangOpts(); - if (args.getLastArg(clang::driver::options::OPT_fwrapv)) + if (args.getLastArg(clang::options::OPT_fwrapv)) opts.setSignedOverflowBehavior(Fortran::common::LangOptions::SOB_Defined); return true; @@ -1404,7 +1377,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, Fortran::common::LangOptions &opts = invoc.getLangOpts(); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_ffp_contract)) { + args.getLastArg(clang::options::OPT_ffp_contract)) { const llvm::StringRef val = a->getValue(); enum Fortran::common::LangOptions::FPModeKind fpContractMode; @@ -1421,31 +1394,31 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, opts.setFPContractMode(fpContractMode); } - if (args.getLastArg(clang::driver::options::OPT_menable_no_infs)) { + if (args.getLastArg(clang::options::OPT_menable_no_infs)) { opts.NoHonorInfs = true; } - if (args.getLastArg(clang::driver::options::OPT_menable_no_nans)) { + if (args.getLastArg(clang::options::OPT_menable_no_nans)) { opts.NoHonorNaNs = true; } - if (args.getLastArg(clang::driver::options::OPT_fapprox_func)) { + if (args.getLastArg(clang::options::OPT_fapprox_func)) { opts.ApproxFunc = true; } - if (args.getLastArg(clang::driver::options::OPT_fno_signed_zeros)) { + if (args.getLastArg(clang::options::OPT_fno_signed_zeros)) { opts.NoSignedZeros = true; } - if (args.getLastArg(clang::driver::options::OPT_mreassociate)) { + if (args.getLastArg(clang::options::OPT_mreassociate)) { opts.AssociativeMath = true; } - if (args.getLastArg(clang::driver::options::OPT_freciprocal_math)) { + if (args.getLastArg(clang::options::OPT_freciprocal_math)) { opts.ReciprocalMath = true; } - if (args.getLastArg(clang::driver::options::OPT_ffast_math)) { + if (args.getLastArg(clang::options::OPT_ffast_math)) { opts.NoHonorInfs = true; opts.NoHonorNaNs = true; opts.AssociativeMath = true; @@ -1455,7 +1428,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast); } - if (args.hasArg(clang::driver::options::OPT_fno_fast_real_mod)) + if (args.hasArg(clang::options::OPT_fno_fast_real_mod)) opts.NoFastRealMod = true; return true; @@ -1470,10 +1443,8 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, /// \param [out] diags DiagnosticsEngine to report erros with static bool parseVScaleArgs(CompilerInvocation &invoc, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - const auto *vscaleMin = - args.getLastArg(clang::driver::options::OPT_mvscale_min_EQ); - const auto *vscaleMax = - args.getLastArg(clang::driver::options::OPT_mvscale_max_EQ); + const auto *vscaleMin = args.getLastArg(clang::options::OPT_mvscale_min_EQ); + const auto *vscaleMax = args.getLastArg(clang::options::OPT_mvscale_max_EQ); if (!vscaleMin && !vscaleMax) return true; @@ -1521,8 +1492,7 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, // TODO: support --dependent-lib on other platforms when MLIR supports // !llvm.dependent.lib - if (args.hasArg(clang::driver::options::OPT_dependent_lib) && - !triple.isOSWindows()) { + if (args.hasArg(clang::options::OPT_dependent_lib) && !triple.isOSWindows()) { const unsigned diagID = diags.getCustomDiagID(clang::DiagnosticsEngine::Error, "--dependent-lib is only supported on Windows"); @@ -1530,12 +1500,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, return false; } - opts.DependentLibs = - args.getAllArgValues(clang::driver::options::OPT_dependent_lib); + opts.DependentLibs = args.getAllArgValues(clang::options::OPT_dependent_lib); // -flto=full/thin option. - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_flto_EQ)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_flto_EQ)) { llvm::StringRef s = a->getValue(); assert((s == "full" || s == "thin") && "Unknown LTO mode."); if (s == "full") @@ -1546,10 +1514,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, // -ffat-lto-objects if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_ffat_lto_objects, - clang::driver::options::OPT_fno_fat_lto_objects)) { + args.getLastArg(clang::options::OPT_ffat_lto_objects, + clang::options::OPT_fno_fat_lto_objects)) { opts.PrepareForFatLTO = - arg->getOption().matches(clang::driver::options::OPT_ffat_lto_objects); + arg->getOption().matches(clang::options::OPT_ffat_lto_objects); if (opts.PrepareForFatLTO) { assert((opts.PrepareForFullLTO || opts.PrepareForThinLTO) && "Unknown LTO mode"); @@ -1590,8 +1558,8 @@ bool CompilerInvocation::createFromArgs( llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple()); // Parse the arguments - const llvm::opt::OptTable &opts = clang::driver::getDriverOptTable(); - llvm::opt::Visibility visibilityMask(clang::driver::options::FC1Option); + const llvm::opt::OptTable &opts = clang::getDriverOptTable(); + llvm::opt::Visibility visibilityMask(clang::options::FC1Option); unsigned missingArgIndex, missingArgCount; llvm::opt::InputArgList args = opts.ParseArgs( commandLineArgs, missingArgIndex, missingArgCount, visibilityMask); @@ -1604,7 +1572,7 @@ bool CompilerInvocation::createFromArgs( } // Issue errors on unknown arguments - for (const auto *a : args.filtered(clang::driver::options::OPT_UNKNOWN)) { + for (const auto *a : args.filtered(clang::options::OPT_UNKNOWN)) { auto argString = a->getAsString(args); std::string nearest; if (opts.findNearest(argString, nearest, visibilityMask) > 1) @@ -1616,15 +1584,15 @@ bool CompilerInvocation::createFromArgs( } // -flang-experimental-hlfir - if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir) || - args.hasArg(clang::driver::options::OPT_emit_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_experimental_hlfir) || + args.hasArg(clang::options::OPT_emit_hlfir)) { invoc.loweringOpts.setLowerToHighLevelFIR(true); } // -flang-deprecated-no-hlfir - if (args.hasArg(clang::driver::options::OPT_flang_deprecated_no_hlfir) && - !args.hasArg(clang::driver::options::OPT_emit_hlfir)) { - if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_deprecated_no_hlfir) && + !args.hasArg(clang::options::OPT_emit_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_experimental_hlfir)) { const unsigned diagID = diags.getCustomDiagID( clang::DiagnosticsEngine::Error, "Options '-flang-experimental-hlfir' and " @@ -1635,13 +1603,13 @@ bool CompilerInvocation::createFromArgs( } // -fno-ppc-native-vector-element-order - if (args.hasArg(clang::driver::options::OPT_fno_ppc_native_vec_elem_order)) { + if (args.hasArg(clang::options::OPT_fno_ppc_native_vec_elem_order)) { invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } // -f[no-]init-global-zero - if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, - clang::driver::options::OPT_fno_init_global_zero, + if (args.hasFlag(clang::options::OPT_finit_global_zero, + clang::options::OPT_fno_init_global_zero, /*default=*/true)) invoc.loweringOpts.setInitGlobalZero(true); else @@ -1650,8 +1618,8 @@ bool CompilerInvocation::createFromArgs( // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp. - for (auto *a : args.filtered(clang::driver::options::OPT_R_Group)) { - if (a->getOption().matches(clang::driver::options::OPT_R_value_Group)) + for (auto *a : args.filtered(clang::options::OPT_R_Group)) { + if (a->getOption().matches(clang::options::OPT_R_value_Group)) // This is -Rfoo=, where foo is the name of the diagnostic // group. Add only the remark option name to the diagnostics. e.g. for // -Rpass= we will add the string "pass". @@ -1664,20 +1632,19 @@ bool CompilerInvocation::createFromArgs( } // -frealloc-lhs is the default. - if (!args.hasFlag(clang::driver::options::OPT_frealloc_lhs, - clang::driver::options::OPT_fno_realloc_lhs, true)) + if (!args.hasFlag(clang::options::OPT_frealloc_lhs, + clang::options::OPT_fno_realloc_lhs, true)) invoc.loweringOpts.setReallocateLHS(false); - invoc.loweringOpts.setRepackArrays( - args.hasFlag(clang::driver::options::OPT_frepack_arrays, - clang::driver::options::OPT_fno_repack_arrays, - /*default=*/false)); + invoc.loweringOpts.setRepackArrays(args.hasFlag( + clang::options::OPT_frepack_arrays, clang::options::OPT_fno_repack_arrays, + /*default=*/false)); invoc.loweringOpts.setStackRepackArrays( - args.hasFlag(clang::driver::options::OPT_fstack_repack_arrays, - clang::driver::options::OPT_fno_stack_repack_arrays, + args.hasFlag(clang::options::OPT_fstack_repack_arrays, + clang::options::OPT_fno_stack_repack_arrays, /*default=*/false)); - if (auto *arg = args.getLastArg( - clang::driver::options::OPT_frepack_arrays_contiguity_EQ)) + if (auto *arg = + args.getLastArg(clang::options::OPT_frepack_arrays_contiguity_EQ)) invoc.loweringOpts.setRepackArraysWhole(arg->getValue() == llvm::StringRef{"whole"}); @@ -1697,10 +1664,8 @@ bool CompilerInvocation::createFromArgs( // `mlirArgs`. Instead, you can use // * `-mllvm `, or // * `-mmlir `. - invoc.frontendOpts.llvmArgs = - args.getAllArgValues(clang::driver::options::OPT_mllvm); - invoc.frontendOpts.mlirArgs = - args.getAllArgValues(clang::driver::options::OPT_mmlir); + invoc.frontendOpts.llvmArgs = args.getAllArgValues(clang::options::OPT_mllvm); + invoc.frontendOpts.mlirArgs = args.getAllArgValues(clang::options::OPT_mmlir); success &= parseLangOptionsArgs(invoc, args, diags); @@ -1724,7 +1689,7 @@ bool CompilerInvocation::createFromArgs( } // Process the timing-related options. - if (args.hasArg(clang::driver::options::OPT_ftime_report)) + if (args.hasArg(clang::options::OPT_ftime_report)) invoc.enableTimers = true; invoc.setArgv0(argv0); diff --git a/flang/lib/FrontendTool/CMakeLists.txt b/flang/lib/FrontendTool/CMakeLists.txt index faf56e9d955a1..b69436c36d438 100644 --- a/flang/lib/FrontendTool/CMakeLists.txt +++ b/flang/lib/FrontendTool/CMakeLists.txt @@ -18,5 +18,6 @@ add_flang_library(flangFrontendTool CLANG_LIBS clangBasic + clangOptions clangDriver ) diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index 09ac129d3e689..7586be59ba01b 100644 --- a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -23,7 +23,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/Pass/PassManager.h" #include "clang/Basic/DiagnosticFrontend.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" #include "llvm/Support/BuryPointer.h" @@ -153,10 +153,10 @@ updateDiagEngineForOptRemarks(clang::DiagnosticsEngine &diagsEng, bool executeCompilerInvocation(CompilerInstance *flang) { // Honor -help. if (flang->getFrontendOpts().showHelp) { - clang::driver::getDriverOptTable().printHelp( + clang::getDriverOptTable().printHelp( llvm::outs(), "flang -fc1 [options] file...", "LLVM 'Flang' Compiler", /*ShowHidden=*/false, /*ShowAllAliases=*/false, - llvm::opt::Visibility(clang::driver::options::FC1Option)); + llvm::opt::Visibility(clang::options::FC1Option)); return true; } diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 73ddd1ff80126..ef9894232b409 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -27,6 +27,26 @@ using namespace mlir; #define DEBUG_TYPE "fir-alias-analysis" +// Inspect for value-scoped Allocate effects and determine whether +// 'candidate' is a new allocation. Returns SourceKind::Allocate if a +// MemAlloc effect is attached +static fir::AliasAnalysis::SourceKind +classifyAllocateFromEffects(mlir::Operation *op, mlir::Value candidate) { + if (!op) + return fir::AliasAnalysis::SourceKind::Unknown; + auto interface = llvm::dyn_cast(op); + if (!interface) + return fir::AliasAnalysis::SourceKind::Unknown; + llvm::SmallVector effects; + interface.getEffects(effects); + for (mlir::MemoryEffects::EffectInstance &e : effects) { + if (mlir::isa(e.getEffect()) && + e.getValue() && e.getValue() == candidate) + return fir::AliasAnalysis::SourceKind::Allocate; + } + return fir::AliasAnalysis::SourceKind::Unknown; +} + //===----------------------------------------------------------------------===// // AliasAnalysis: alias //===----------------------------------------------------------------------===// @@ -535,6 +555,11 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, mlir::Operation *instantiationPoint{nullptr}; while (defOp && !breakFromLoop) { ty = defOp->getResultTypes()[0]; + // Value-scoped allocation detection via effects. + if (classifyAllocateFromEffects(defOp, v) == SourceKind::Allocate) { + type = SourceKind::Allocate; + break; + } llvm::TypeSwitch(defOp) .Case([&](auto op) { v = op.getVar(); @@ -554,11 +579,6 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, defOp = v.getDefiningOp(); } }) - .Case([&](auto op) { - // Unique memory allocation. - type = SourceKind::Allocate; - breakFromLoop = true; - }) .Case([&](auto op) { // Skip ConvertOp's and track further through the operand. v = op->getOperand(0); @@ -628,16 +648,23 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, type = SourceKind::Global; } else { auto def = llvm::cast(boxSrc.origin.u); - // TODO: Add support to fir.allocmem - if (auto allocOp = def.template getDefiningOp()) { - v = def; - defOp = v.getDefiningOp(); - type = SourceKind::Allocate; - } else if (isDummyArgument(def)) { - defOp = nullptr; - v = def; - } else { - type = SourceKind::Indirect; + bool classified = false; + if (auto defDefOp = def.getDefiningOp()) { + if (classifyAllocateFromEffects(defDefOp, def) == + SourceKind::Allocate) { + v = def; + defOp = defDefOp; + type = SourceKind::Allocate; + classified = true; + } + } + if (!classified) { + if (isDummyArgument(def)) { + defOp = nullptr; + v = def; + } else { + type = SourceKind::Indirect; + } } } breakFromLoop = true; diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp index cf7588f275d22..461deb8e4cb55 100644 --- a/flang/lib/Optimizer/Builder/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -9,6 +9,7 @@ #include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" +#include "flang/Optimizer/Dialect/Support/KindMapping.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" @@ -91,3 +92,25 @@ void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) { } } } + +int cuf::computeElementByteSize(mlir::Location loc, mlir::Type type, + fir::KindMapping &kindMap, + bool emitErrorOnFailure) { + auto eleTy = fir::unwrapSequenceType(type); + if (auto t{mlir::dyn_cast(eleTy)}) + return t.getWidth() / 8; + if (auto t{mlir::dyn_cast(eleTy)}) + return t.getWidth() / 8; + if (auto t{mlir::dyn_cast(eleTy)}) + return kindMap.getLogicalBitsize(t.getFKind()) / 8; + if (auto t{mlir::dyn_cast(eleTy)}) { + int elemSize = + mlir::cast(t.getElementType()).getWidth() / 8; + return 2 * elemSize; + } + if (auto t{mlir::dyn_cast(eleTy)}) + return kindMap.getCharacterBitsize(t.getFKind()) / 8; + if (emitErrorOnFailure) + mlir::emitError(loc, "unsupported type"); + return 0; +} diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 8d00272b09f42..5b1b0a2f6feab 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -263,28 +263,6 @@ static bool inDeviceContext(mlir::Operation *op) { return false; } -static int computeWidth(mlir::Location loc, mlir::Type type, - fir::KindMapping &kindMap) { - auto eleTy = fir::unwrapSequenceType(type); - if (auto t{mlir::dyn_cast(eleTy)}) - return t.getWidth() / 8; - if (auto t{mlir::dyn_cast(eleTy)}) - return t.getWidth() / 8; - if (eleTy.isInteger(1)) - return 1; - if (auto t{mlir::dyn_cast(eleTy)}) - return kindMap.getLogicalBitsize(t.getFKind()) / 8; - if (auto t{mlir::dyn_cast(eleTy)}) { - int elemSize = - mlir::cast(t.getElementType()).getWidth() / 8; - return 2 * elemSize; - } - if (auto t{mlir::dyn_cast_or_null(eleTy)}) - return kindMap.getCharacterBitsize(t.getFKind()) / 8; - mlir::emitError(loc, "unsupported type"); - return 0; -} - struct CUFAllocOpConversion : public mlir::OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -320,7 +298,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern { mlir::Value bytes; fir::KindMapping kindMap{fir::getKindMapping(mod)}; if (fir::isa_trivial(op.getInType())) { - int width = computeWidth(loc, op.getInType(), kindMap); + int width = cuf::computeElementByteSize(loc, op.getInType(), kindMap); bytes = builder.createIntegerConstant(loc, builder.getIndexType(), width); } else if (auto seqTy = mlir::dyn_cast_or_null( @@ -330,7 +308,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern { mlir::Type structTy = typeConverter->convertType(seqTy.getEleTy()); size = dl->getTypeSizeInBits(structTy) / 8; } else { - size = computeWidth(loc, seqTy.getEleTy(), kindMap); + size = cuf::computeElementByteSize(loc, seqTy.getEleTy(), kindMap); } mlir::Value width = builder.createIntegerConstant(loc, builder.getIndexType(), size); @@ -704,7 +682,7 @@ struct CUFDataTransferOpConversion typeConverter->convertType(fir::unwrapSequenceType(dstTy)); width = dl->getTypeSizeInBits(structTy) / 8; } else { - width = computeWidth(loc, dstTy, kindMap); + width = cuf::computeElementByteSize(loc, dstTy, kindMap); } mlir::Value widthValue = mlir::arith::ConstantOp::create( rewriter, loc, i64Ty, rewriter.getIntegerAttr(i64Ty, width)); diff --git a/flang/test/Driver/tco-emit-final-mlir.fir b/flang/test/Driver/tco-emit-final-mlir.fir index 75f8f153127af..7e934c921e773 100644 --- a/flang/test/Driver/tco-emit-final-mlir.fir +++ b/flang/test/Driver/tco-emit-final-mlir.fir @@ -15,5 +15,7 @@ func.func @_QPfoo() { %1 = fir.alloca i32 + %0 = arith.constant 0 : i32 + fir.store %0 to %1 : !fir.ref return } diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir index 8da8b828c18b9..613c8e274baad 100644 --- a/flang/test/Fir/alloc.fir +++ b/flang/test/Fir/alloc.fir @@ -372,8 +372,17 @@ func.func @alloca_unlimited_polymorphic_box() { %1 = fir.alloca !fir.class> %2 = fir.alloca !fir.box %3 = fir.alloca !fir.box> + // Add real uses so allocas are not trivially dead. + fir.call @__use_class_none(%0) : (!fir.ref>) -> () + fir.call @__use_class_array(%1) : (!fir.ref>>) -> () + fir.call @__use_box_none(%2) : (!fir.ref>) -> () + fir.call @__use_box_array(%3) : (!fir.ref>>) -> () return } +func.func private @__use_class_none(!fir.ref>) -> () +func.func private @__use_class_array(!fir.ref>>) -> () +func.func private @__use_box_none(!fir.ref>) -> () +func.func private @__use_box_array(!fir.ref>>) -> () // Note: allocmem of fir.box are not possible (fir::HeapType::verify does not // accept box types), so there is no equivalent of // alloca_unlimited_polymorphic_box for allocmem. diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir index 1645e1a407ad4..47fffb35a7d92 100644 --- a/flang/test/Fir/omp-reduction-embox-codegen.fir +++ b/flang/test/Fir/omp-reduction-embox-codegen.fir @@ -28,9 +28,11 @@ func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { omp.parallel reduction(byref @test_reduction %4 -> %arg0 : !fir.ref>) { omp.terminator } + func.call @__use_box_i32(%4) : (!fir.ref>) -> () return } +func.func private @__use_box_i32(!fir.ref>) -> () // basically we are testing that there isn't a crash // CHECK-LABEL: define void @_QQmain // CHECK-NEXT: alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 diff --git a/flang/test/Fir/pdt.fir b/flang/test/Fir/pdt.fir index a200cd7e7cc03..04f48e745d033 100644 --- a/flang/test/Fir/pdt.fir +++ b/flang/test/Fir/pdt.fir @@ -95,14 +95,14 @@ func.func @_QTt1P.f2.offset(%0 : i32, %1 : i32) -> i32 { // end program p func.func private @bar(!fir.ref>) +func.func private @__use_t1(!fir.ref,f2:!fir.char<1,?>}>>) -> () // CHECK-LABEL: define void @_QPfoo(i32 %0, i32 %1) func.func @_QPfoo(%arg0 : i32, %arg1 : i32) { // CHECK: %[[size:.*]] = call i64 @_QTt1P.mem.size(i32 %0, i32 %1) // CHECK: %[[alloc:.*]] = alloca i8, i64 %[[size]] %0 = fir.alloca !fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>(%arg0, %arg1 : i32, i32) - //%2 = fir.coordinate_of %0, f2 : (!fir.ref>) -> !fir.ref> - %2 = fir.zero_bits !fir.ref> - fir.call @bar(%2) : (!fir.ref>) -> () + // Keep alloca live without creating an unsupported coordinate_of on dynamic-sized field. + func.call @__use_t1(%0) : (!fir.ref,f2:!fir.char<1,?>}>>) -> () return } diff --git a/flang/test/HLFIR/inline-hlfir-copy-in.fir b/flang/test/HLFIR/inline-hlfir-copy-in.fir index f3c4b38962a0c..f1da1da9f9a5c 100644 --- a/flang/test/HLFIR/inline-hlfir-copy-in.fir +++ b/flang/test/HLFIR/inline-hlfir-copy-in.fir @@ -75,7 +75,7 @@ func.func private @_test_inline_copy_in(%arg0: !fir.box> { // CHECK: %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box>) -> !fir.ref> // CHECK: %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref, !fir.ref, i1) // CHECK: fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath : (!fir.ref>, !fir.ref) -> () -// CHECK: hlfir.copy_out %16, %15#1 : (!fir.ref>>, i1) -> () +// CHECK: hlfir.copy_out %{{.*}}, %[[VAL_21:.*]]#1 : (!fir.ref>>, i1) -> () // CHECK: hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref, i1 // CHECK: return // CHECK: } diff --git a/flang/test/Lower/Intrinsics/c_f_pointer.f90 b/flang/test/Lower/Intrinsics/c_f_pointer.f90 index c1f1d7972d4b1..f54fda42cf51b 100644 --- a/flang/test/Lower/Intrinsics/c_f_pointer.f90 +++ b/flang/test/Lower/Intrinsics/c_f_pointer.f90 @@ -153,7 +153,6 @@ subroutine dynamic_shape_lower(cptr, fpr, shape, lower) ! CHECK: %[[VAL_2:.*]] = fir.shape %[[C_0]], %[[C_0]] : (index, index) -> !fir.shape<2> ! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_1:.*]](%[[VAL_2]]) : (!fir.ptr>, !fir.shape<2>) -> !fir.box>> ! CHECK: fir.store %[[VAL_3]] to %[[VAL_0:.*]] : !fir.ref>>> -! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFdynamic_shape_lowerEn"} ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0:.*]], __address : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> !fir.ptr> diff --git a/flang/test/Lower/Intrinsics/system_clock.f90 b/flang/test/Lower/Intrinsics/system_clock.f90 index 9eae3a58884fa..f6fae1113b315 100644 --- a/flang/test/Lower/Intrinsics/system_clock.f90 +++ b/flang/test/Lower/Intrinsics/system_clock.f90 @@ -32,11 +32,9 @@ subroutine system_clock_test() ! CHECK-LABEL: @_QPss subroutine ss(count) - ! CHECK: %[[V_0:[0-9]+]] = fir.alloca !fir.box> {bindc_name = "count_max", uniq_name = "_QFssEcount_max"} ! CHECK: %[[V_1:[0-9]+]] = fir.alloca !fir.heap {uniq_name = "_QFssEcount_max.addr"} ! CHECK: %[[V_2:[0-9]+]] = fir.zero_bits !fir.heap ! CHECK: fir.store %[[V_2]] to %[[V_1]] : !fir.ref> - ! CHECK: %[[V_3:[0-9]+]] = fir.alloca !fir.box> {bindc_name = "count_rate", uniq_name = "_QFssEcount_rate"} ! CHECK: %[[V_4:[0-9]+]] = fir.alloca !fir.ptr {uniq_name = "_QFssEcount_rate.addr"} ! CHECK: %[[V_5:[0-9]+]] = fir.zero_bits !fir.ptr ! CHECK: fir.store %[[V_5]] to %[[V_4]] : !fir.ref> diff --git a/flang/test/Lower/allocatables.f90 b/flang/test/Lower/allocatables.f90 index e62f92fa0c1c7..60b7de3301c48 100644 --- a/flang/test/Lower/allocatables.f90 +++ b/flang/test/Lower/allocatables.f90 @@ -56,7 +56,7 @@ subroutine foodim1() ! CHECK-DAG: fir.load %[[xAddrVar]] : !fir.ref>> deallocate(x) - ! CHECK: %[[xAddr1:.*]] = fir.load %1 : !fir.ref>> + ! CHECK: %[[xAddr1:.*]] = fir.load %{{.*}} : !fir.ref>> ! CHECK: fir.freemem %[[xAddr1]] ! CHECK: %[[nullAddr1:.*]] = fir.zero_bits !fir.heap> ! CHECK: fir.store %[[nullAddr1]] to %[[xAddrVar]] : !fir.ref>> @@ -67,10 +67,6 @@ subroutine foodim2() ! Test lowering of local allocatable specification real, allocatable :: x(:, :) ! CHECK-DAG: fir.alloca !fir.heap> {{{.*}}uniq_name = "_QFfoodim2Ex.addr"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb0"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext0"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb1"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext1"} end subroutine ! test lowering of character allocatables. Focus is placed on the length handling diff --git a/flang/test/Lower/character-local-variables.f90 b/flang/test/Lower/character-local-variables.f90 index d5b959eca1ff6..6325229993a25 100644 --- a/flang/test/Lower/character-local-variables.f90 +++ b/flang/test/Lower/character-local-variables.f90 @@ -8,6 +8,7 @@ subroutine scalar_cst_len() character(10) :: c ! CHECK: fir.alloca !fir.char<1,10> {{{.*}}uniq_name = "_QFscalar_cst_lenEc"} + print *, c end subroutine ! CHECK-LABEL: func @_QPscalar_dyn_len @@ -19,12 +20,14 @@ subroutine scalar_dyn_len(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32 ! CHECK: fir.alloca !fir.char<1,?>(%[[l]] : i32) {{{.*}}uniq_name = "_QFscalar_dyn_lenEc"} + print *, c end subroutine ! CHECK-LABEL: func @_QPcst_array_cst_len subroutine cst_array_cst_len() character(10) :: c(20) ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPcst_array_dyn_len @@ -36,6 +39,7 @@ subroutine cst_array_dyn_len(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32 ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i32) {{{.*}}uniq_name = "_QFcst_array_dyn_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPdyn_array_cst_len @@ -48,6 +52,7 @@ subroutine dyn_array_cst_len(n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_lenEc"} + print *, c(1) end subroutine ! CHECK: func @_QPdyn_array_dyn_len @@ -63,12 +68,14 @@ subroutine dyn_array_dyn_len(l, n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array>(%[[l]] : i32), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPcst_array_cst_len_lb subroutine cst_array_cst_len_lb() character(10) :: c(11:30) ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPcst_array_dyn_len_lb @@ -80,6 +87,7 @@ subroutine cst_array_dyn_len_lb(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i64 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i64 ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i64) {{{.*}}uniq_name = "_QFcst_array_dyn_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPdyn_array_cst_len_lb @@ -94,6 +102,7 @@ subroutine dyn_array_cst_len_lb(n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPdyn_array_dyn_len_lb @@ -111,6 +120,7 @@ subroutine dyn_array_dyn_len_lb(l, n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array>(%[[l]] : i64), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_len_lbEc"} + print *, c(11) end subroutine ! Test that the length of assumed length parameter is correctly deduced in lowering. @@ -129,4 +139,5 @@ subroutine assumed_length_param(n) subroutine scalar_cst_neg_len() character(-1) :: c ! CHECK: fir.alloca !fir.char<1,0> {{{.*}}uniq_name = "_QFscalar_cst_neg_lenEc"} + print *, c end subroutine diff --git a/flang/test/Lower/derived-types.f90 b/flang/test/Lower/derived-types.f90 index 4d36a7632b070..7e36ec0cfe93f 100644 --- a/flang/test/Lower/derived-types.f90 +++ b/flang/test/Lower/derived-types.f90 @@ -35,6 +35,8 @@ subroutine local_derived() ! CHECK-DAG: fir.alloca !fir.type<_QMdTr{x:f32}> type(r) :: some_r type(c2) :: some_c2 + print *, some_c2%ch_array(1,1) + print *, some_r%x end subroutine ! CHECK-LABEL: func @_QMdPsaved_derived( diff --git a/flang/test/Lower/do_loop_unstructured.f90 b/flang/test/Lower/do_loop_unstructured.f90 index 3b03850b43bb2..9c7d874a1aac8 100644 --- a/flang/test/Lower/do_loop_unstructured.f90 +++ b/flang/test/Lower/do_loop_unstructured.f90 @@ -235,6 +235,7 @@ subroutine nested_structured_in_unstructured() subroutine unstructured_do_concurrent logical :: success do concurrent (i=1:10) local(success) + success = .false. error stop "fail" enddo end diff --git a/flang/test/Lower/forall/array-pointer.f90 b/flang/test/Lower/forall/array-pointer.f90 index fd3efed736c39..6b8c5648af29e 100644 --- a/flang/test/Lower/forall/array-pointer.f90 +++ b/flang/test/Lower/forall/array-pointer.f90 @@ -318,7 +318,6 @@ end subroutine s2_3 ! CHECK-LABEL: func @_QPs2_3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box>}>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} -! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box>> {bindc_name = "y", fir.target, uniq_name = "_QFs2_3Ey"} ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.heap> {uniq_name = "_QFs2_3Ey.addr"} ! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.lb0"} ! CHECK: %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.ext0"} diff --git a/flang/test/Lower/forall/forall-allocatable.f90 b/flang/test/Lower/forall/forall-allocatable.f90 index 96cd37ea3ed8a..8e54d282aea4b 100644 --- a/flang/test/Lower/forall/forall-allocatable.f90 +++ b/flang/test/Lower/forall/forall-allocatable.f90 @@ -13,20 +13,19 @@ end subroutine forall_with_allocatable ! CHECK-LABEL: func @_QPforall_with_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box>{{.*}}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} -! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box>> {bindc_name = "arr", uniq_name = "_QFforall_with_allocatableEarr"} -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.heap> {uniq_name = "_QFforall_with_allocatableEarr.addr"} -! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"} -! CHECK: %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"} -! CHECK: %[[VAL_6:.*]] = fir.zero_bits !fir.heap> -! CHECK: fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref>> +! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.heap> {uniq_name = "_QFforall_with_allocatableEarr.addr"} +! CHECK: %[[VAL_3:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"} +! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"} +! CHECK: %[[VAL_5:.*]] = fir.zero_bits !fir.heap> +! CHECK: fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref>> ! CHECK: %[[VAL_7:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index ! CHECK: %[[VAL_9:.*]] = arith.constant 15 : i32 ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : index -! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_4]] : !fir.ref -! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_5]] : !fir.ref -! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_3]] : !fir.ref>> +! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_3]] : !fir.ref +! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_4]] : !fir.ref +! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_2]] : !fir.ref>> ! CHECK: %[[VAL_15:.*]] = fir.shape_shift %[[VAL_12]], %[[VAL_13]] : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_16:.*]] = fir.array_load %[[VAL_14]](%[[VAL_15]]) : (!fir.heap>, !fir.shapeshift<1>) -> !fir.array ! CHECK: %[[VAL_17:.*]] = fir.array_load %[[VAL_0]] : (!fir.box>) -> !fir.array diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90 index 2fea84b03891a..5ee6562733dae 100644 --- a/flang/test/Lower/loops.f90 +++ b/flang/test/Lower/loops.f90 @@ -90,7 +90,6 @@ subroutine lis(n) ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}}, %{{.*}} {bindc_name = "a", fir.target, uniq_name = "_QFlisEa"} ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}} {bindc_name = "r", uniq_name = "_QFlisEr"} ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}} {bindc_name = "s", uniq_name = "_QFlisEs"} - ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}} {bindc_name = "t", uniq_name = "_QFlisEt"} integer, target :: a(n,n,n) ! operand via p integer :: r(n,n) ! result, unspecified locality integer :: s(n,n) ! shared locality diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index f586380e653a0..bc4eed54282df 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -287,7 +287,6 @@ subroutine pointer_assign_parent(p) ! First test is here to have a reference with non polymorphic on both sides. ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_parent( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref> {fir.bindc_name = "p", fir.target}) { -! CHECK: %[[TP:.*]] = fir.alloca !fir.box>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp"} ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr> {uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp.addr"} ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr> ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref>> @@ -302,7 +301,6 @@ subroutine pointer_assign_non_poly(p) ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_non_poly( ! CHECK-SAME: %arg0: !fir.class> {fir.bindc_name = "p", fir.target}) { -! CHECK: %[[TP:.*]] = fir.alloca !fir.box>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp"} ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr> {uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp.addr"} ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr> ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref>> @@ -1103,11 +1101,9 @@ subroutine class_with_entry(a) ! CHECK-LABEL: func.func @_QMpolymorphic_testPclass_with_entry( ! CHECK-SAME: %[[A:.*]]: !fir.class> {fir.bindc_name = "a"}) { -! CHECK: %[[B:.*]] = fir.alloca !fir.class> {bindc_name = "b", uniq_name = "_QMpolymorphic_testFclass_with_entryEb"} ! CHECK-LABEL: func.func @_QMpolymorphic_testPd( ! CHECK-SAME: %[[B:.*]]: !fir.class> {fir.bindc_name = "b"}) { -! CHECK: %[[A:.*]] = fir.alloca !fir.class> {bindc_name = "a", uniq_name = "_QMpolymorphic_testFclass_with_entryEa"} subroutine class_array_with_entry(a) class(p1) :: a(:), b(:) diff --git a/flang/test/Lower/statement-function.f90 b/flang/test/Lower/statement-function.f90 index cfec06c35baa8..fe07649e669af 100644 --- a/flang/test/Lower/statement-function.f90 +++ b/flang/test/Lower/statement-function.f90 @@ -129,7 +129,6 @@ integer function test_stmt_character_with_different_length_2(c, n) character(n) :: argc character(*) :: c ! CHECK: %[[unboxed:.*]]:2 = fir.unboxchar %[[arg0]] : - ! CHECK: fir.load %[[arg1]] : !fir.ref ! CHECK: %[[n:.*]] = fir.load %[[arg1]] : !fir.ref ! CHECK: %[[n_is_positive:.*]] = arith.cmpi sgt, %[[n]], %c0{{.*}} : i32 ! CHECK: %[[len:.*]] = arith.select %[[n_is_positive]], %[[n]], %c0{{.*}} : i32 diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir index 4a417ed981ab1..25fc73153003a 100644 --- a/flang/test/Transforms/stack-arrays.fir +++ b/flang/test/Transforms/stack-arrays.fir @@ -3,13 +3,17 @@ // Simplest transformation func.func @simple() { %0 = fir.allocmem !fir.array<42xi32> + %c0_s = arith.constant 0 : index + %c0_i32_s = arith.constant 0 : i32 + %ref_s = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt_s = fir.coordinate_of %ref_s, %c0_s : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_s to %elt_s : !fir.ref fir.freemem %0 : !fir.heap> return } -// CHECK: func.func @simple() { -// CHECK-NEXT: fir.alloca !fir.array<42xi32> -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK: func.func @simple() +// CHECK: fir.alloca !fir.array<42xi32> +// CHECK: return // Check fir.must_be_heap allocations are not moved func.func @must_be_heap() { @@ -17,7 +21,7 @@ func.func @must_be_heap() { fir.freemem %0 : !fir.heap> return } -// CHECK: func.func @must_be_heap() { +// CHECK-LABEL: func.func @must_be_heap() // CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true} // CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap> // CHECK-NEXT: return @@ -36,7 +40,7 @@ func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) { } return } -// CHECK: func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) { +// CHECK-LABEL: func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) // CHECK-NEXT: %[[C42:.*]] = arith.constant 42 : index // CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array, %[[C42]] {uniq_name = "_QFdfa1Earr.alloc"} // CHECK-NEXT: %[[LOGICAL:.*]] = fir.load %arg0 : !fir.ref> @@ -57,7 +61,7 @@ func.func @dfa2(%arg0: i1) { } return } -// CHECK: func.func @dfa2(%arg0: i1) { +// CHECK-LABEL: func.func @dfa2(%arg0: i1) // CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<1xi8> // CHECK-NEXT: scf.if %arg0 { // CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap> @@ -74,15 +78,16 @@ func.func @dfa3(%arg0: i1) { } else { fir.freemem %a : !fir.heap> } + %c0_d3 = arith.constant 0 : index + %c0_i8_d3 = arith.constant 0 : i8 + %ref_d3 = fir.convert %a : (!fir.heap>) -> !fir.ref> + %elt_d3 = fir.coordinate_of %ref_d3, %c0_d3 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i8_d3 to %elt_d3 : !fir.ref return } -// CHECK: func.func @dfa3(%arg0: i1) { -// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> -// CHECK-NEXT: fir.if %arg0 { -// CHECK-NEXT: } else { -// CHECK-NEXT: } -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK: func.func @dfa3(%arg0: i1) +// CHECK: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> +// CHECK: return func.func private @dfa3a_foo(!fir.ref>) -> () func.func private @dfa3a_bar(!fir.ref>) -> () @@ -101,7 +106,7 @@ func.func @dfa3a(%arg0: i1) { } return } -// CHECK: func.func @dfa3a(%arg0: i1) { +// CHECK-LABEL: func.func @dfa3a(%arg0: i1) // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> // CHECK-NEXT: %[[HEAP:.*]] = fir.convert %[[MEM]] : (!fir.ref>) -> !fir.heap> // CHECK-NEXT: fir.if %arg0 { @@ -123,13 +128,18 @@ func.func @placement1() { // operand is now available %4 = fir.allocmem !fir.array, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref1 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt1 = fir.coordinate_of %ref1, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32 to %elt1 : !fir.ref fir.freemem %4 : !fir.heap> return } -// CHECK: func.func @placement1() { +// CHECK-LABEL: func.func @placement1() // CHECK-NEXT: %[[ARG:.*]] = arith.constant 3 : index // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[ARG]] -// CHECK-NEXT: return +// CHECK: return // CHECK-NEXT: } // check that if there are no operands, then the alloca is placed early @@ -140,16 +150,21 @@ func.func @placement2() { %3 = arith.addi %1, %2 : index %4 = fir.allocmem !fir.array<42xi32> // ... + %c0_p2 = arith.constant 0 : index + %c0_i32_p2 = arith.constant 0 : i32 + %ref_p2 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt_p2 = fir.coordinate_of %ref_p2, %c0_p2 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_p2 to %elt_p2 : !fir.ref fir.freemem %4 : !fir.heap> return } -// CHECK: func.func @placement2() { -// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> -// CHECK-NEXT: %[[ONE:.*]] = arith.constant 1 : index -// CHECK-NEXT: %[[TWO:.*]] = arith.constant 2 : index -// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK-LABEL: func.func @placement2() +// CHECK: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> +// CHECK: %[[ONE:.*]] = arith.constant 1 : index +// CHECK: %[[TWO:.*]] = arith.constant 2 : index +// CHECK: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index +// CHECK: return +// CHECK: } // check that stack allocations which must be placed in loops use stacksave func.func @placement3() { @@ -162,12 +177,17 @@ func.func @placement3() { // operand is now available %4 = fir.allocmem !fir.array, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref2 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt2 = fir.coordinate_of %ref2, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32 to %elt2 : !fir.ref fir.freemem %4 : !fir.heap> fir.result %3, %c1_i32 : index, i32 } return } -// CHECK: func.func @placement3() { +// CHECK-LABEL: func.func @placement3() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -176,7 +196,7 @@ func.func @placement3() { // CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index // CHECK-NEXT: %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[SUM]] -// CHECK-NEXT: llvm.intr.stackrestore %[[SP]] : !llvm.ptr +// CHECK: llvm.intr.stackrestore %[[SP]] : !llvm.ptr // CHECK-NEXT: fir.result // CHECK-NEXT: } // CHECK-NEXT: return @@ -194,12 +214,17 @@ func.func @placement4(%arg0 : i1) { // operand is now available %4 = fir.allocmem !fir.array, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref3 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt3 = fir.coordinate_of %ref3, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32 to %elt3 : !fir.ref fir.freemem %4 : !fir.heap> cf.cond_br %arg0, ^bb1, ^bb2 ^bb2: return } -// CHECK: func.func @placement4(%arg0: i1) { +// CHECK-LABEL: func.func @placement4(%arg0: i1) // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index @@ -208,7 +233,7 @@ func.func @placement4(%arg0 : i1) { // CHECK-NEXT: %[[C3:.*]] = arith.constant 3 : index // CHECK-NEXT: %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[C3]] -// CHECK-NEXT: llvm.intr.stackrestore %[[SP]] : !llvm.ptr +// CHECK: llvm.intr.stackrestore %[[SP]] : !llvm.ptr // CHECK-NEXT: cf.cond_br %arg0, ^bb1, ^bb2 // CHECK-NEXT: ^bb2: // CHECK-NEXT: return @@ -230,7 +255,7 @@ func.func @placement5() { } return } -// CHECK: func.func @placement5() { +// CHECK-LABEL: func.func @placement5() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -268,7 +293,7 @@ func.func @placement6(%arg0: i1) { fir.freemem %4 : !fir.heap> cf.br ^bb1 } -// CHECK: func.func @placement6(%arg0: i1) { +// CHECK-LABEL: func.func @placement6(%arg0: i1) // CHECK-NEXT: %[[c1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[c1_i32:.*]] = fir.convert %[[c1]] : (index) -> i32 // CHECK-NEXT: %[[c2:.*]] = arith.constant 2 : index @@ -289,6 +314,11 @@ func.func @placement6(%arg0: i1) { // Check multiple returns, where the memory is always freed func.func @returns(%arg0: i1) { %0 = fir.allocmem !fir.array<42xi32> + %c0_ret = arith.constant 0 : index + %c0_i32_ret = arith.constant 0 : i32 + %ref_ret = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt_ret = fir.coordinate_of %ref_ret, %c0_ret : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_ret to %elt_ret : !fir.ref cf.cond_br %arg0, ^bb1, ^bb2 ^bb1: fir.freemem %0 : !fir.heap> @@ -297,9 +327,9 @@ func.func @returns(%arg0: i1) { fir.freemem %0 : !fir.heap> return } -// CHECK: func.func @returns(%[[COND:.*]]: i1) { -// CHECK-NEXT: %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32> -// CHECK-NEXT: cf.cond_br %[[COND]], ^bb1, ^bb2 +// CHECK-LABEL: func.func @returns( +// CHECK: %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32> +// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // CHECK-NEXT: return // CHECK-NEXT: ^bb2: @@ -309,6 +339,11 @@ func.func @returns(%arg0: i1) { // Check multiple returns, where the memory is not freed on one branch func.func @returns2(%arg0: i1) { %0 = fir.allocmem !fir.array<42xi32> + %c0_ret2 = arith.constant 0 : index + %c0_i32_ret2 = arith.constant 0 : i32 + %ref_ret2 = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt_ret2 = fir.coordinate_of %ref_ret2, %c0_ret2 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_ret2 to %elt_ret2 : !fir.ref cf.cond_br %arg0, ^bb1, ^bb2 ^bb1: fir.freemem %0 : !fir.heap> @@ -316,9 +351,9 @@ func.func @returns2(%arg0: i1) { ^bb2: return } -// CHECK: func.func @returns2(%[[COND:.*]]: i1) { -// CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> -// CHECK-NEXT: cf.cond_br %[[COND]], ^bb1, ^bb2 +// CHECK-LABEL: func.func @returns2( +// CHECK: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> +// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap> // CHECK-NEXT: return @@ -338,7 +373,7 @@ func.func @omp_placement1() { } return } -// CHECK: func.func @omp_placement1() { +// CHECK-LABEL: func.func @omp_placement1() // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> // CHECK-NEXT: %[[MEM_CONV:.*]] = fir.convert %[[MEM]] : (!fir.ref>) -> !fir.heap> // CHECK-NEXT: omp.sections { @@ -353,19 +388,21 @@ func.func @omp_placement1() { // function terminated by stop statement func.func @stop_terminator() { %0 = fir.allocmem !fir.array<42xi32> + %c0 = arith.constant 0 : index + %c0_i32_st = arith.constant 0 : i32 + %ref4 = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt4 = fir.coordinate_of %ref4, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_st to %elt4 : !fir.ref fir.freemem %0 : !fir.heap> %c0_i32 = arith.constant 0 : i32 %false = arith.constant false fir.call @_FortranAStopStatement(%c0_i32, %false, %false) : (i32, i1, i1) -> () fir.unreachable } -// CHECK: func.func @stop_terminator() { -// CHECK-NEXT: fir.alloca !fir.array<42xi32> -// CHECK-NEXT: %[[ZERO:.*]] = arith.constant 0 : i32 -// CHECK-NEXT: %[[FALSE:.*]] = arith.constant false -// CHECK-NEXT: fir.call @_FortranAStopStatement(%[[ZERO]], %[[FALSE]], %[[FALSE]]) : (i32, i1, i1) -> () -// CHECK-NEXT: fir.unreachable -// CHECK-NEXT: } +// CHECK-LABEL: func.func @stop_terminator() +// CHECK: fir.alloca !fir.array<42xi32> +// CHECK: fir.call @_FortranAStopStatement( +// CHECK: fir.unreachable // check that stack allocations that use fir.declare which must be placed in loops @@ -387,7 +424,7 @@ func.func @placement_loop_declare() { } return } -// CHECK: func.func @placement_loop_declare() { +// CHECK-LABEL: func.func @placement_loop_declare() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -415,7 +452,7 @@ func.func @lookthrough() { fir.freemem %4 : !fir.heap> return } -// CHECK: func.func @lookthrough() { +// CHECK-LABEL: func.func @lookthrough() // CHECK: fir.alloca !fir.array<42xi32> // CHECK-NOT: fir.freemem @@ -457,6 +494,6 @@ func.func @finding_freemem_in_block() { ^bb3: // pred: ^bb1 return } -// CHECK: func.func @finding_freemem_in_block() { +// CHECK-LABEL: func.func @finding_freemem_in_block() // CHECK: fir.alloca !fir.array // CHECK-NOT: fir.freemem diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt index 801fc324e888d..3caec17b3edbb 100644 --- a/flang/tools/flang-driver/CMakeLists.txt +++ b/flang/tools/flang-driver/CMakeLists.txt @@ -26,6 +26,7 @@ target_link_libraries(flang clang_target_link_libraries(flang PRIVATE clangDriver + clangOptions clangBasic ) diff --git a/flang/tools/flang-driver/driver.cpp b/flang/tools/flang-driver/driver.cpp index bd878b7a642f1..0840255a739f3 100644 --- a/flang/tools/flang-driver/driver.cpp +++ b/flang/tools/flang-driver/driver.cpp @@ -52,9 +52,9 @@ createAndPopulateDiagOpts(llvm::ArrayRef argv) { // Any errors that would be diagnosed here will also be diagnosed later, // when the DiagnosticsEngine actually exists. unsigned missingArgIndex, missingArgCount; - llvm::opt::InputArgList args = clang::driver::getDriverOptTable().ParseArgs( + llvm::opt::InputArgList args = clang::getDriverOptTable().ParseArgs( argv.slice(1), missingArgIndex, missingArgCount, - llvm::opt::Visibility(clang::driver::options::FlangOption)); + llvm::opt::Visibility(clang::options::FlangOption)); (void)Fortran::frontend::parseDiagnosticArgs(*diagOpts, args); diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp index bb76eee90efd2..08db4859c6417 100644 --- a/libc/src/__support/OSUtil/linux/fcntl.cpp +++ b/libc/src/__support/OSUtil/linux/fcntl.cpp @@ -66,7 +66,7 @@ ErrorOr fcntl(int fd, int cmd, void *arg) { LIBC_NAMESPACE::syscall_impl(FCNTL_SYSCALL_ID, fd, cmd, &flk64); // On failure, return if (ret < 0) - return Error(-1); + return Error(-ret); // Check for overflow, i.e. the offsets are not the same when cast // to off_t from off64_t. if (static_cast(flk64.l_len) != flk64.l_len || diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h index 564441d3bf51a..c47a03d741f98 100644 --- a/libc/src/stdio/printf_core/vfprintf_internal.h +++ b/libc/src/stdio/printf_core/vfprintf_internal.h @@ -51,8 +51,11 @@ LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); } LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, ::FILE *f) { // Need to use system errno in this case, as system write will set this errno - // which we need to propagate back into our code. - return {::fwrite_unlocked(ptr, size, nmemb, f), errno}; + // which we need to propagate back into our code. fwrite only modifies errno + // if there was an error, and errno may have previously been nonzero. Only + // return errno if there was an error. + size_t members_written = ::fwrite_unlocked(ptr, size, nmemb, f); + return {members_written, members_written == nmemb ? 0 : errno}; } #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace internal diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp index 84feb34e537a0..d008aea54b425 100644 --- a/libc/test/src/fcntl/fcntl_test.cpp +++ b/libc/test/src/fcntl/fcntl_test.cpp @@ -94,68 +94,105 @@ TEST_F(LlvmLibcFcntlTest, FcntlSetFl) { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); } -TEST_F(LlvmLibcFcntlTest, FcntlGetLkRead) { - using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test"; - auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); - - struct flock flk, svflk; - int retVal; - int fd = - LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(fd, 0); - - flk.l_type = F_RDLCK; - flk.l_start = 0; - flk.l_whence = SEEK_SET; - flk.l_len = 50; - - // copy flk into svflk - svflk = flk; - - retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - ASSERT_NE((int)flk.l_type, F_WRLCK); // File should not be write locked. - - retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - - ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); -} - -TEST_F(LlvmLibcFcntlTest, FcntlGetLkWrite) { - using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test"; - auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); - - struct flock flk, svflk; - int retVal; - int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(fd, 0); - - flk.l_type = F_WRLCK; - flk.l_start = 0; - flk.l_whence = SEEK_SET; - flk.l_len = 0; - - // copy flk into svflk - svflk = flk; - - retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - ASSERT_NE((int)flk.l_type, F_RDLCK); // File should not be read locked. - - retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - - ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); -} +/* Tests that are common between OFD and traditional variants of fcntl locks. */ +template +class LibcFcntlCommonLockTests : public LlvmLibcFcntlTest { +public: + void GetLkRead() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + + struct flock flk = {}; + struct flock svflk = {}; + int retVal; + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + flk.l_type = F_RDLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 50; + + // copy flk into svflk + svflk = flk; + + retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + ASSERT_NE((int)svflk.l_type, F_WRLCK); // File should not be write locked. + + retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + } + + void GetLkWrite() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + + struct flock flk = {}; + struct flock svflk = {}; + int retVal; + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + flk.l_type = F_WRLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 0; + + // copy flk into svflk + svflk = flk; + + retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + ASSERT_NE((int)svflk.l_type, F_RDLCK); // File should not be read locked. + + retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + } + + void UseAfterClose() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = + "testdata/fcntl_use_after_close.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + + flock flk = {}; + flk.l_type = F_RDLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 50; + ASSERT_EQ(-1, LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &flk)); + ASSERT_ERRNO_EQ(EBADF); + } +}; + +#define COMMON_LOCK_TESTS(NAME, GETLK_CMD, SETLK_CMD) \ + using NAME = LibcFcntlCommonLockTests; \ + TEST_F(NAME, GetLkRead) { GetLkRead(); } \ + TEST_F(NAME, GetLkWrite) { GetLkWrite(); } \ + TEST_F(NAME, UseAfterClose) { UseAfterClose(); } \ + static_assert(true, "Require semicolon.") + +COMMON_LOCK_TESTS(LlvmLibcFcntlProcessAssociatedLockTest, F_GETLK, F_SETLK); +COMMON_LOCK_TESTS(LlvmLibcFcntlOpenFileDescriptionLockTest, F_OFD_GETLK, + F_OFD_SETLK); TEST_F(LlvmLibcFcntlTest, UseAfterClose) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp index 8b4a3e0a7c3fb..bfbd85ea34203 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp @@ -56,7 +56,7 @@ using namespace lldb; using namespace lldb_private; #define OPTTABLE_STR_TABLE_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_STR_TABLE_CODE static Status ExceptionMaskValidator(const char *string, void *unused) { @@ -1124,7 +1124,7 @@ void PlatformDarwin::AddClangModuleCompilationOptionsForSDKType( #define OPTION(PREFIX_OFFSET, NAME_OFFSET, VAR, ...) \ llvm::StringRef opt_##VAR = OptionStrTable[NAME_OFFSET]; \ (void)opt_##VAR; -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTION minimum_version_option << '-'; switch (sdk_type) { diff --git a/lldb/test/Shell/helper/build.py b/lldb/test/Shell/helper/build.py index a5a7e997be044..1fa8aab92c128 100755 --- a/lldb/test/Shell/helper/build.py +++ b/lldb/test/Shell/helper/build.py @@ -804,7 +804,19 @@ def _get_link_command(self): args.extend(self._obj_file_names()) if sys.platform == "darwin": + # By default, macOS doesn't allow injecting the ASAN + # runtime into system processes. + system_clang = ( + subprocess.check_output(["xcrun", "-find", "clang"]) + .strip() + .decode("utf-8") + ) + system_liblto = os.path.join( + os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib" + ) args.extend(["-isysroot", self.apple_sdk]) + args.extend(["-Wl,-lto_library", "-Wl," + system_liblto]) + elif self.objc_gnustep_lib: args.extend(["-L", self.objc_gnustep_lib, "-lobjc"]) if sys.platform == "linux": diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 728f6347242f1..faa29d23387cc 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -250,6 +250,15 @@ def use_support_substitutions(config): "-L{}".format(config.libcxx_libs_dir), "-lc++", ] + # By default, macOS doesn't allow injecting the ASAN runtime into system processes. + if platform.system() in ["Darwin"] and config.llvm_use_sanitizer: + system_clang = ( + subprocess.check_output(["xcrun", "-find", "clang"]).strip().decode("utf-8") + ) + system_liblto = os.path.join( + os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib" + ) + host_flags += ["-Wl,-lto_library", "-Wl," + system_liblto] host_flags = " ".join(host_flags) config.substitutions.append(("%clang_host", "%clang " + host_flags)) diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h index 7b1a5f5019589..ee1f28377f7e4 100644 --- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h +++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h @@ -68,17 +68,13 @@ class BasicBlockSectionsProfileReader { BasicBlockSectionsProfileReader() = default; - // Returns true if basic block sections profile exist for function \p - // FuncName. + // Returns true if function \p FuncName is hot based on the basic block + // section profile. bool isFunctionHot(StringRef FuncName) const; - // Returns a pair with first element representing whether basic block sections - // profile exist for the function \p FuncName, and the second element - // representing the basic block sections profile (cluster info) for this - // function. If the first element is true and the second element is empty, it - // means unique basic block sections are desired for all basic blocks of the - // function. - std::pair> + // Returns the cluster info for the function \p FuncName. Returns an empty + // vector if function has no cluster info. + SmallVector getClusterInfoForFunction(StringRef FuncName) const; // Returns the path clonings for the given function. @@ -190,7 +186,7 @@ class BasicBlockSectionsProfileReaderWrapperPass : public ImmutablePass { bool isFunctionHot(StringRef FuncName) const; - std::pair> + SmallVector getClusterInfoForFunction(StringRef FuncName) const; SmallVector> diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 7010cffe23a11..02cd9256c422a 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -113,15 +113,18 @@ struct ExtAddrMode { /// class LLVM_ABI TargetInstrInfo : public MCInstrInfo { protected: + const TargetRegisterInfo &TRI; + /// Subtarget specific sub-array of MCInstrInfo's RegClassByHwModeTables /// (i.e. the table for the active HwMode). This should be indexed by /// MCOperandInfo's RegClass field for LookupRegClassByHwMode operands. const int16_t *const RegClassByHwMode; - TargetInstrInfo(unsigned CFSetupOpcode = ~0u, unsigned CFDestroyOpcode = ~0u, - unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u, + TargetInstrInfo(const TargetRegisterInfo &TRI, unsigned CFSetupOpcode = ~0u, + unsigned CFDestroyOpcode = ~0u, unsigned CatchRetOpcode = ~0u, + unsigned ReturnOpcode = ~0u, const int16_t *const RegClassByHwModeTable = nullptr) - : RegClassByHwMode(RegClassByHwModeTable), + : TRI(TRI), RegClassByHwMode(RegClassByHwModeTable), CallFrameSetupOpcode(CFSetupOpcode), CallFrameDestroyOpcode(CFDestroyOpcode), CatchRetOpcode(CatchRetOpcode), ReturnOpcode(ReturnOpcode) {} @@ -131,6 +134,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { TargetInstrInfo &operator=(const TargetInstrInfo &) = delete; virtual ~TargetInstrInfo(); + const TargetRegisterInfo &getRegisterInfo() const { return TRI; } + static bool isGenericOpcode(unsigned Opc) { return Opc <= TargetOpcode::GENERIC_OP_END; } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 857c5da91a9f5..48d8b695c9aa8 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -3669,7 +3669,7 @@ class OpenMPIRBuilder { /// \param Name Name of the variable. LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, - unsigned AddressSpace = 0); + std::optional AddressSpace = {}); }; /// Class to represented the control flow structure of an OpenMP canonical loop. diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 3b3c26fee02ec..1716492e73ff9 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -406,7 +406,7 @@ class LLVM_ABI MCAsmInfo { // Generated object files can use all ELF features supported by GNU ld of // this binutils version and later. INT_MAX means all features can be used, // regardless of GNU ld support. The default value is referenced by - // clang/Driver/Options.td. + // clang/Options/Options.td. std::pair BinutilsVersion = {2, 26}; /// Should we use the integrated assembler? diff --git a/llvm/include/llvm/Option/Arg.h b/llvm/include/llvm/Option/Arg.h index b1e56b58da684..496373d28600f 100644 --- a/llvm/include/llvm/Option/Arg.h +++ b/llvm/include/llvm/Option/Arg.h @@ -51,7 +51,7 @@ class Arg { /// Was this argument used to affect compilation? /// /// This is used to generate an "argument unused" warning (without - /// clang::driver::options::TargetSpecific) or "unsupported option" error + /// clang::options::TargetSpecific) or "unsupported option" error /// (with TargetSpecific). mutable unsigned Claimed : 1; diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h index cb8e568de02e0..dee4db584c88b 100644 --- a/llvm/include/llvm/Support/SpecialCaseList.h +++ b/llvm/include/llvm/Support/SpecialCaseList.h @@ -126,15 +126,16 @@ class SpecialCaseList { SpecialCaseList &operator=(SpecialCaseList const &) = delete; private: + using Match = std::pair; + static constexpr Match NotMatched = {"", 0}; + // Lagacy v1 matcher. class RegexMatcher { public: LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); LLVM_ABI void preprocess(bool BySize); - LLVM_ABI void - match(StringRef Query, - llvm::function_ref Cb) const; + LLVM_ABI Match match(StringRef Query) const; struct Reg { Reg(StringRef Name, unsigned LineNo, Regex &&Rg) @@ -152,9 +153,7 @@ class SpecialCaseList { LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); LLVM_ABI void preprocess(bool BySize); - LLVM_ABI void - match(StringRef Query, - llvm::function_ref Cb) const; + LLVM_ABI Match match(StringRef Query) const; struct Glob { Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern) @@ -168,11 +167,10 @@ class SpecialCaseList { RadixTree, RadixTree, - SmallVector>> + SmallVector>> PrefixSuffixToGlob; - RadixTree, - SmallVector> + RadixTree, SmallVector> SubstrToGlob; }; @@ -184,14 +182,10 @@ class SpecialCaseList { LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); LLVM_ABI void preprocess(bool BySize); - LLVM_ABI void - match(StringRef Query, - llvm::function_ref Cb) const; + LLVM_ABI Match match(StringRef Query) const; LLVM_ABI bool matchAny(StringRef Query) const { - bool R = false; - match(Query, [&](StringRef, unsigned) { R = true; }); - return R; + return match(Query) != NotMatched; } std::variant M; @@ -201,17 +195,22 @@ class SpecialCaseList { using SectionEntries = StringMap>; protected: - struct Section { + class Section { + public: Section(StringRef Str, unsigned FileIdx, bool UseGlobs) : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false), SectionStr(Str), FileIdx(FileIdx) {} Section(Section &&) = default; - Matcher SectionMatcher; - SectionEntries Entries; - std::string SectionStr; - unsigned FileIdx; + // Return name of the section, its entire string in []. + StringRef name() const { return SectionStr; } + + // Returns true if string 'Name' matches section name interpreted as a glob. + LLVM_ABI bool matchName(StringRef Name) const; + + // Return sequence number of the file where this section is defined. + unsigned fileIndex() const { return FileIdx; } // Helper method to search by Prefix, Query, and Category. Returns // 1-based line number on which rule is defined, or 0 if there is no match. @@ -223,11 +222,19 @@ class SpecialCaseList { LLVM_ABI StringRef getLongestMatch(StringRef Prefix, StringRef Query, StringRef Category) const; + /// Returns true if the section has any entries for the given prefix. + LLVM_ABI bool hasPrefix(StringRef Prefix) const; + private: friend class SpecialCaseList; LLVM_ABI void preprocess(bool OrderBySize); LLVM_ABI const SpecialCaseList::Matcher * findMatcher(StringRef Prefix, StringRef Category) const; + + Matcher SectionMatcher; + std::string SectionStr; + SectionEntries Entries; + unsigned FileIdx; }; ArrayRef sections() const { return Sections; } diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index e317e1c06741f..52e2909bec072 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -183,8 +183,7 @@ updateBranches(MachineFunction &MF, // clusters are ordered in increasing order of their IDs, with the "Exception" // and "Cold" succeeding all other clusters. // FuncClusterInfo represents the cluster information for basic blocks. It -// maps from BBID of basic blocks to their cluster information. If this is -// empty, it means unique sections for all basic blocks in the function. +// maps from BBID of basic blocks to their cluster information. static void assignSections(MachineFunction &MF, const DenseMap &FuncClusterInfo) { @@ -197,10 +196,8 @@ assignSections(MachineFunction &MF, for (auto &MBB : MF) { // With the 'all' option, every basic block is placed in a unique section. // With the 'list' option, every basic block is placed in a section - // associated with its cluster, unless we want individual unique sections - // for every basic block in this function (if FuncClusterInfo is empty). - if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All || - FuncClusterInfo.empty()) { + // associated with its cluster. + if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) { // If unique sections are desired for all basic blocks of the function, we // set every basic block's section ID equal to its original position in // the layout (which is equal to its number). This ensures that basic @@ -308,22 +305,22 @@ bool BasicBlockSections::handleBBSections(MachineFunction &MF) { if (BBSectionsType == BasicBlockSection::List && hasInstrProfHashMismatch(MF)) return false; - // Renumber blocks before sorting them. This is useful for accessing the - // original layout positions and finding the original fallthroughs. - MF.RenumberBlocks(); DenseMap FuncClusterInfo; if (BBSectionsType == BasicBlockSection::List) { - auto [HasProfile, ClusterInfo] = - getAnalysis() - .getClusterInfoForFunction(MF.getName()); - if (!HasProfile) + auto ClusterInfo = getAnalysis() + .getClusterInfoForFunction(MF.getName()); + if (ClusterInfo.empty()) return false; for (auto &BBClusterInfo : ClusterInfo) { FuncClusterInfo.try_emplace(BBClusterInfo.BBID, BBClusterInfo); } } + // Renumber blocks before sorting them. This is useful for accessing the + // original layout positions and finding the original fallthroughs. + MF.RenumberBlocks(); + MF.setBBSectionsType(BBSectionsType); assignSections(MF, FuncClusterInfo); diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index 485b44ae4c4aa..c234c0f1b0b34 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -58,22 +58,24 @@ BasicBlockSectionsProfileReader::parseUniqueBBID(StringRef S) const { } bool BasicBlockSectionsProfileReader::isFunctionHot(StringRef FuncName) const { - return getClusterInfoForFunction(FuncName).first; + return !getClusterInfoForFunction(FuncName).empty(); } -std::pair> +SmallVector BasicBlockSectionsProfileReader::getClusterInfoForFunction( StringRef FuncName) const { auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); - return R != ProgramPathAndClusterInfo.end() - ? std::pair(true, R->second.ClusterInfo) - : std::pair(false, SmallVector()); + return R != ProgramPathAndClusterInfo.end() ? R->second.ClusterInfo + : SmallVector(); } SmallVector> BasicBlockSectionsProfileReader::getClonePathsForFunction( StringRef FuncName) const { - return ProgramPathAndClusterInfo.lookup(getAliasName(FuncName)).ClonePaths; + auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); + return R != ProgramPathAndClusterInfo.end() + ? R->second.ClonePaths + : SmallVector>(); } uint64_t BasicBlockSectionsProfileReader::getEdgeCount( @@ -494,7 +496,7 @@ bool BasicBlockSectionsProfileReaderWrapperPass::isFunctionHot( return BBSPR.isFunctionHot(FuncName); } -std::pair> +SmallVector BasicBlockSectionsProfileReaderWrapperPass::getClusterInfoForFunction( StringRef FuncName) const { return BBSPR.getClusterInfoForFunction(FuncName); diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 3c41bbeb4b327..7c89e51ff86d3 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -60,7 +60,7 @@ TargetInstrInfo::~TargetInstrInfo() = default; const TargetRegisterClass * TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo * /*RemoveMe*/) const { if (OpNum >= MCID.getNumOperands()) return nullptr; @@ -69,14 +69,14 @@ TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, // TODO: Remove isLookupPtrRegClass in favor of isLookupRegClassByHwMode if (OpInfo.isLookupPtrRegClass()) - return TRI->getPointerRegClass(RegClass); + return TRI.getPointerRegClass(RegClass); // Instructions like INSERT_SUBREG do not have fixed register classes. if (RegClass < 0) return nullptr; // Otherwise just look it up normally. - return TRI->getRegClass(RegClass); + return TRI.getRegClass(RegClass); } /// insertNoop - Insert a noop into the instruction stream at the specified @@ -223,13 +223,11 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI, // %1.sub = INST %1.sub(tied), %0.sub, implicit-def %1 SmallVector UpdateImplicitDefIdx; if (HasDef && MI.hasImplicitDef()) { - const TargetRegisterInfo *TRI = - MI.getMF()->getSubtarget().getRegisterInfo(); for (auto [OpNo, MO] : llvm::enumerate(MI.implicit_operands())) { Register ImplReg = MO.getReg(); if ((ImplReg.isVirtual() && ImplReg == Reg0) || (ImplReg.isPhysical() && Reg0.isPhysical() && - TRI->isSubRegisterEq(ImplReg, Reg0))) + TRI.isSubRegisterEq(ImplReg, Reg0))) UpdateImplicitDefIdx.push_back(OpNo + MI.getNumExplicitOperands()); } } @@ -425,37 +423,35 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx, unsigned &Size, unsigned &Offset, const MachineFunction &MF) const { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!SubIdx) { - Size = TRI->getSpillSize(*RC); + Size = TRI.getSpillSize(*RC); Offset = 0; return true; } - unsigned BitSize = TRI->getSubRegIdxSize(SubIdx); + unsigned BitSize = TRI.getSubRegIdxSize(SubIdx); // Convert bit size to byte size. if (BitSize % 8) return false; - int BitOffset = TRI->getSubRegIdxOffset(SubIdx); + int BitOffset = TRI.getSubRegIdxOffset(SubIdx); if (BitOffset < 0 || BitOffset % 8) return false; Size = BitSize / 8; Offset = (unsigned)BitOffset / 8; - assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); + assert(TRI.getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); if (!MF.getDataLayout().isLittleEndian()) { - Offset = TRI->getSpillSize(*RC) - (Offset + Size); + Offset = TRI.getSpillSize(*RC) - (Offset + Size); } return true; } -void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const { +void TargetInstrInfo::reMaterialize( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, + unsigned SubIdx, const MachineInstr &Orig, + const TargetRegisterInfo & /*Remove me*/) const { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig); MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI); MBB.insert(I, MI); @@ -726,7 +722,6 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, // actual load size is. int64_t MemSize = 0; const MachineFrameInfo &MFI = MF.getFrameInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (Flags & MachineMemOperand::MOStore) { MemSize = MFI.getObjectSize(FI); @@ -735,7 +730,7 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, int64_t OpSize = MFI.getObjectSize(FI); if (auto SubReg = MI.getOperand(OpIdx).getSubReg()) { - unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg); + unsigned SubRegSize = TRI.getSubRegIdxSize(SubReg); if (SubRegSize > 0 && !(SubRegSize % 8)) OpSize = SubRegSize / 8; } @@ -800,11 +795,11 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, // code. BuildMI(*MBB, Pos, MI.getDebugLoc(), get(TargetOpcode::KILL)).add(MO); } else { - storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI, + storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, &TRI, Register()); } } else - loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI, Register()); + loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, &TRI, Register()); return &*--Pos; } @@ -880,8 +875,8 @@ static void transferImplicitOperands(MachineInstr *MI, } } -void TargetInstrInfo::lowerCopy(MachineInstr *MI, - const TargetRegisterInfo *TRI) const { +void TargetInstrInfo::lowerCopy( + MachineInstr *MI, const TargetRegisterInfo * /*Remove me*/) const { if (MI->allDefsAreDead()) { MI->setDesc(get(TargetOpcode::KILL)); return; @@ -911,7 +906,7 @@ void TargetInstrInfo::lowerCopy(MachineInstr *MI, SrcMO.getReg().isPhysical() ? SrcMO.isRenamable() : false); if (MI->getNumOperands() > 2) - transferImplicitOperands(MI, TRI); + transferImplicitOperands(MI, &TRI); MI->eraseFromParent(); } @@ -1327,8 +1322,7 @@ void TargetInstrInfo::reassociateOps( MachineFunction *MF = Root.getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, &TRI); MachineOperand &OpA = Prev.getOperand(OperandIndices[1]); MachineOperand &OpB = Root.getOperand(OperandIndices[2]); @@ -1704,8 +1698,7 @@ bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // stack slot reference to depend on the instruction that does the // modification. const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI); + return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), &TRI); } // Provide a global flag for disabling the PreRA hazard recognizer that targets @@ -1738,11 +1731,11 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, // Default implementation of getMemOperandWithOffset. bool TargetInstrInfo::getMemOperandWithOffset( const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, - bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const { + bool &OffsetIsScalable, const TargetRegisterInfo * /*RemoveMe*/) const { SmallVector BaseOps; LocationSize Width = LocationSize::precise(0); if (!getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable, - Width, TRI) || + Width, &TRI) || BaseOps.size() != 1) return false; BaseOp = BaseOps.front(); @@ -1863,7 +1856,6 @@ std::optional TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const { const MachineFunction *MF = MI.getMF(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DIExpression *Expr = DIExpression::get(MF->getFunction().getContext(), {}); int64_t Offset; bool OffsetIsScalable; @@ -1894,7 +1886,6 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, // Only describe memory which provably does not escape the function. As // described in llvm.org/PR43343, escaped memory may be clobbered by the // callee (or by another thread). - const auto &TII = MF->getSubtarget().getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); const MachineMemOperand *MMO = MI.memoperands()[0]; const PseudoSourceValue *PSV = MMO->getPseudoValue(); @@ -1905,8 +1896,7 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, return std::nullopt; const MachineOperand *BaseOp; - if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, - TRI)) + if (!getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, &TRI)) return std::nullopt; // FIXME: Scalable offsets are not yet handled in the offset code below. @@ -2045,7 +2035,7 @@ bool TargetInstrInfo::getInsertSubregInputs( // Returns a MIRPrinter comment for this machine operand. std::string TargetInstrInfo::createMIROperandComment( const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo * /*RemoveMe*/) const { if (!MI.isInlineAsm()) return ""; @@ -2078,12 +2068,8 @@ std::string TargetInstrInfo::createMIROperandComment( OS << F.getKindName(); unsigned RCID; - if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID)) { - if (TRI) { - OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID)); - } else - OS << ":RC" << RCID; - } + if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID)) + OS << ':' << TRI.getRegClassName(TRI.getRegClass(RCID)); if (F.isMemKind()) { InlineAsm::ConstraintCode MCID = F.getMemoryConstraintID(); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 377e180d1d044..2a43961fa167e 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -8567,9 +8567,8 @@ OpenMPIRBuilder::createPlatformSpecificName(ArrayRef Parts) const { Config.separator()); } -GlobalVariable * -OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name, - unsigned AddressSpace) { +GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable( + Type *Ty, const StringRef &Name, std::optional AddressSpace) { auto &Elem = *InternalVars.try_emplace(Name, nullptr).first; if (Elem.second) { assert(Elem.second->getValueType() == Ty && @@ -8579,16 +8578,18 @@ OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name, // variable for possibly changing that to internal or private, or maybe // create different versions of the function for different OMP internal // variables. + const DataLayout &DL = M.getDataLayout(); + unsigned AddressSpaceVal = + AddressSpace ? *AddressSpace : DL.getDefaultGlobalsAddressSpace(); auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32 ? GlobalValue::InternalLinkage : GlobalValue::CommonLinkage; auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage, Constant::getNullValue(Ty), Elem.first(), /*InsertBefore=*/nullptr, - GlobalValue::NotThreadLocal, AddressSpace); - const DataLayout &DL = M.getDataLayout(); + GlobalValue::NotThreadLocal, AddressSpaceVal); const llvm::Align TypeAlign = DL.getABITypeAlign(Ty); - const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace); + const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal); GV->setAlignment(std::max(TypeAlign, PtrAlign)); Elem.second = GV; } diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 246d90cce3a43..beec8b8ef0d5b 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -15,11 +15,13 @@ #include "llvm/Support/SpecialCaseList.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -63,12 +65,12 @@ void SpecialCaseList::RegexMatcher::preprocess(bool BySize) { } } -void SpecialCaseList::RegexMatcher::match( - StringRef Query, - llvm::function_ref Cb) const { +SpecialCaseList::Match +SpecialCaseList::RegexMatcher::match(StringRef Query) const { for (const auto &R : reverse(RegExes)) if (R.Rg.match(Query)) - return Cb(R.Name, R.LineNo); + return {R.Name, R.LineNo}; + return NotMatched; } Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern, @@ -90,7 +92,7 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { }); } - for (const auto &G : reverse(Globs)) { + for (const auto &[Idx, G] : enumerate(Globs)) { StringRef Prefix = G.Pattern.prefix(); StringRef Suffix = G.Pattern.suffix(); @@ -102,26 +104,29 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { // But only if substring is not empty. Searching this tree is more // expensive. auto &V = SubstrToGlob.emplace(Substr).first->second; - V.emplace_back(&G); + V.emplace_back(Idx); continue; } } auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second; auto &V = SToGlob.emplace(reverse(Suffix)).first->second; - V.emplace_back(&G); + V.emplace_back(Idx); } } -void SpecialCaseList::GlobMatcher::match( - StringRef Query, - llvm::function_ref Cb) const { +SpecialCaseList::Match +SpecialCaseList::GlobMatcher::match(StringRef Query) const { + int Best = -1; if (!PrefixSuffixToGlob.empty()) { for (const auto &[_, SToGlob] : PrefixSuffixToGlob.find_prefixes(Query)) { for (const auto &[_, V] : SToGlob.find_prefixes(reverse(Query))) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); + for (int Idx : reverse(V)) { + if (Best > Idx) + break; + const GlobMatcher::Glob &G = Globs[Idx]; + if (G.Pattern.match(Query)) { + Best = Idx; // As soon as we find a match in the vector, we can break for this // vector, since the globs are already sorted by priority within the // prefix group. However, we continue searching other prefix groups @@ -138,9 +143,12 @@ void SpecialCaseList::GlobMatcher::match( // possibilities. In most cases search will fail on first characters. for (StringRef Q = Query; !Q.empty(); Q = Q.drop_front()) { for (const auto &[_, V] : SubstrToGlob.find_prefixes(Q)) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); + for (int Idx : reverse(V)) { + if (Best > Idx) + break; + const GlobMatcher::Glob &G = Globs[Idx]; + if (G.Pattern.match(Query)) { + Best = Idx; // As soon as we find a match in the vector, we can break for this // vector, since the globs are already sorted by priority within the // prefix group. However, we continue searching other prefix groups @@ -151,6 +159,9 @@ void SpecialCaseList::GlobMatcher::match( } } } + if (Best < 0) + return NotMatched; + return {Globs[Best].Name, Globs[Best].LineNo}; } SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) @@ -169,12 +180,11 @@ void SpecialCaseList::Matcher::preprocess(bool BySize) { return std::visit([&](auto &V) { return V.preprocess(BySize); }, M); } -void SpecialCaseList::Matcher::match( - StringRef Query, - llvm::function_ref Cb) const { +SpecialCaseList::Match SpecialCaseList::Matcher::match(StringRef Query) const { if (RemoveDotSlash) Query = llvm::sys::path::remove_leading_dotslash(Query); - return std::visit([&](auto &V) { return V.match(Query, Cb); }, M); + return std::visit( + [&](auto &V) -> SpecialCaseList::Match { return V.match(Query); }, M); } // TODO: Refactor this to return Expected<...> @@ -264,11 +274,12 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, bool RemoveDotSlash = Version > 2; - Section *CurrentSection; - if (auto Err = addSection("*", FileIdx, 1, true).moveInto(CurrentSection)) { + auto ErrOrSection = addSection("*", FileIdx, 1, true); + if (auto Err = ErrOrSection.takeError()) { Error = toString(std::move(Err)); return false; } + Section *CurrentSection = ErrOrSection.get(); // This is the current list of prefixes for all existing users matching file // path. We may need parametrization in constructor in future. @@ -290,12 +301,13 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, return false; } - if (auto Err = addSection(Line.drop_front().drop_back(), FileIdx, LineNo, - UseGlobs) - .moveInto(CurrentSection)) { + auto ErrOrSection = + addSection(Line.drop_front().drop_back(), FileIdx, LineNo, UseGlobs); + if (auto Err = ErrOrSection.takeError()) { Error = toString(std::move(Err)); return false; } + CurrentSection = ErrOrSection.get(); continue; } @@ -348,6 +360,10 @@ SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix, return NotFound; } +bool SpecialCaseList::Section::matchName(StringRef Name) const { + return SectionMatcher.matchAny(Name); +} + const SpecialCaseList::Matcher * SpecialCaseList::Section::findMatcher(StringRef Prefix, StringRef Category) const { @@ -371,26 +387,21 @@ LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) { unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix, StringRef Query, StringRef Category) const { - unsigned LastLine = 0; - if (const Matcher *M = findMatcher(Prefix, Category)) { - M->match(Query, [&](StringRef, unsigned LineNo) { - LastLine = std::max(LastLine, LineNo); - }); - } - return LastLine; + if (const Matcher *M = findMatcher(Prefix, Category)) + return M->match(Query).second; + return 0; } StringRef SpecialCaseList::Section::getLongestMatch(StringRef Prefix, StringRef Query, StringRef Category) const { - StringRef LongestRule; - if (const Matcher *M = findMatcher(Prefix, Category)) { - M->match(Query, [&](StringRef Rule, unsigned) { - if (LongestRule.size() < Rule.size()) - LongestRule = Rule; - }); - } - return LongestRule; + if (const Matcher *M = findMatcher(Prefix, Category)) + return M->match(Query).first; + return {}; +} + +bool SpecialCaseList::Section::hasPrefix(StringRef Prefix) const { + return Entries.find(Prefix) != Entries.end(); } } // namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 55a04cca4c394..66e49491b18e0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -91,7 +91,7 @@ static cl::opt GatherOptSearchLimit( "machine-combiner gather pattern optimization")); AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) - : AArch64GenInstrInfo(STI, AArch64::ADJCALLSTACKDOWN, + : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, AArch64::CATCHRET), RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {} @@ -1780,6 +1780,16 @@ static unsigned sForm(MachineInstr &Instr) { case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::ANDSXrs: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::BICSWrs: + case AArch64::BICSXrs: return Instr.getOpcode(); case AArch64::ADDWrr: @@ -1810,6 +1820,22 @@ static unsigned sForm(MachineInstr &Instr) { return AArch64::ANDSWri; case AArch64::ANDXri: return AArch64::ANDSXri; + case AArch64::ANDWrr: + return AArch64::ANDSWrr; + case AArch64::ANDWrs: + return AArch64::ANDSWrs; + case AArch64::ANDXrr: + return AArch64::ANDSXrr; + case AArch64::ANDXrs: + return AArch64::ANDSXrs; + case AArch64::BICWrr: + return AArch64::BICSWrr; + case AArch64::BICXrr: + return AArch64::BICSXrr; + case AArch64::BICWrs: + return AArch64::BICSWrs; + case AArch64::BICXrs: + return AArch64::BICSXrs; } } @@ -1947,6 +1973,25 @@ static bool isSUBSRegImm(unsigned Opcode) { return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; } +static bool isANDOpcode(MachineInstr &MI) { + unsigned Opc = sForm(MI); + switch (Opc) { + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::ANDSXrs: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + return true; + default: + return false; + } +} + /// Check if CmpInstr can be substituted by MI. /// /// CmpInstr can be substituted: @@ -1984,7 +2029,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, // 1) MI and CmpInstr set N and V to the same value. // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when // signed overflow occurs, so CmpInstr could still be simplified away. - if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap)) + // Note that Ands and Bics instructions always clear the V flag. + if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI)) return false; AccessKind AccessToCheck = AK_Write; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 8974965c41fe3..ab4004e30f629 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -157,7 +157,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { return usePostRAScheduler(); } bool enableSubRegLiveness() const override { return EnableSubregLiveness; } - + bool enableTerminalRule() const override { return true; } bool enableMachinePipeliner() const override; bool useDFAforSMS() const override { return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c1ee3a2ac6a89..125e212a1b946 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5081,17 +5081,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned MinNumRegsRequired = DstSize / 32; const SIMachineFunctionInfo *Info = MF.getInfo(); + bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 0ea9add891111..b03d50f2d451d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -261,13 +261,6 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( const Function *Callee = getCalleeFunction(*CalleeOp); - // Avoid crashing on undefined behavior with an illegal call to a - // kernel. If a callsite's calling convention doesn't match the - // function's, it's undefined behavior. If the callsite calling - // convention does match, that would have errored earlier. - if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) - report_fatal_error("invalid call to entry function"); - auto isSameFunction = [](const MachineFunction &MF, const Function *F) { return F == &MF.getFunction(); }; diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 3e256cce97afb..01040854e1577 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -29,7 +29,7 @@ using namespace llvm; #include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {} + : R600GenInstrInfo(ST, RI, -1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 89517a8f2d78c..644a410ca5ec2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,7 +63,8 @@ static cl::opt Fix16BitCopies( cl::ReallyHidden); SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) - : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP, + AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { SchedModel.init(&ST); } diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 54f57e02ed47e..85adcab55b742 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -513,6 +513,13 @@ defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16", defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16", VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16, fp_to_sint>; + +let HasClamp = 0, HasOMod = 0 in { +def V_TRANS_BF16_Profile : VOPProfile <[bf16, bf16, untyped, untyped]>; +def V_TRANS_BF16_t16_Profile : VOPProfile_True16 ; +def V_TRANS_BF16_fake16_Profile : VOPProfile_Fake16 ; +} + let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>; @@ -527,14 +534,30 @@ defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; } let SubtargetPredicate = HasBF16TransInsts in { -defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; -defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; -defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; -defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; -defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; -defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; -defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; -defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; +defm V_TANH_BF16 : VOP1Inst_t16_with_profiles<"v_tanh_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + int_amdgcn_tanh>; +defm V_RCP_BF16 : VOP1Inst_t16_with_profiles<"v_rcp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrcp>; +defm V_SQRT_BF16 : VOP1Inst_t16_with_profiles<"v_sqrt_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16_with_profiles<"v_rsq_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16_with_profiles<"v_log_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16_with_profiles<"v_exp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16_with_profiles<"v_sin_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16_with_profiles<"v_cos_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUcos>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 4e63ad54bef27..94bb60861d670 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1357,8 +1357,12 @@ class VOPBinOpClampPat : class getVOP3ModPat { dag src0 = !if(P.HasOMod, - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i32:$omod)), + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers))); list ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp.ret (P.Src0VT src0), diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp index 05bcb3596ac48..2dec6ffd89a75 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp +++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp @@ -44,7 +44,8 @@ enum TSFlagsConstants { void ARCInstrInfo::anchor() {} ARCInstrInfo::ARCInstrInfo(const ARCSubtarget &ST) - : ARCGenInstrInfo(ST, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {} + : ARCGenInstrInfo(ST, RI, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), + RI(ST) {} static bool isZeroImm(const MachineOperand &Op) { return Op.isImm() && Op.getImm() == 0; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 22769dbf38719..b466ca6f19b3e 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -107,8 +107,9 @@ static const ARM_MLxEntry ARM_MLxTable[] = { { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true }, }; -ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI) - : ARMGenInstrInfo(STI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), +ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI, + const ARMBaseRegisterInfo &TRI) + : ARMGenInstrInfo(STI, TRI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), Subtarget(STI) { for (unsigned i = 0, e = std::size(ARM_MLxTable); i != e; ++i) { if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 2869e7f708046..27f8e3b3170e7 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -44,7 +44,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { protected: // Can be only subclassed. - explicit ARMBaseInstrInfo(const ARMSubtarget &STI); + explicit ARMBaseInstrInfo(const ARMSubtarget &STI, + const ARMBaseRegisterInfo &TRI); void expandLoadStackGuardBase(MachineBasicBlock::iterator MI, unsigned LoadImmOpc, unsigned LoadOpc) const; @@ -125,7 +126,11 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { // if there is not such an opcode. virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0; - virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0; + const ARMBaseRegisterInfo &getRegisterInfo() const { + return static_cast( + TargetInstrInfo::getRegisterInfo()); + } + const ARMSubtarget &getSubtarget() const { return Subtarget; } ScheduleHazardRecognizer * diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/llvm/lib/Target/ARM/ARMInstrInfo.cpp index c684de7252e5d..f37054736b730 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -25,7 +25,8 @@ #include "llvm/MC/MCInst.h" using namespace llvm; -ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI) {} +ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI, RI) {} /// Return the noop instruction to use for a noop. MCInst ARMInstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.h b/llvm/lib/Target/ARM/ARMInstrInfo.h index 178d7a2c630e4..9feaf1440f2b2 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -35,7 +35,7 @@ class ARMInstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - const ARMRegisterInfo &getRegisterInfo() const override { return RI; } + const ARMRegisterInfo &getRegisterInfo() const { return RI; } private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 4a0883cc662e7..34baa3108402c 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -377,6 +377,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool isRWPI() const; bool useMachineScheduler() const { return UseMISched; } + bool enableTerminalRule() const override { return true; } bool useMachinePipeliner() const { return UseMIPipeliner; } bool hasMinSize() const { return OptMinSize; } bool isThumb1Only() const { return isThumb() && !hasThumb2(); } diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 4b8c2fd569ead..f95ba6a4d86fb 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb1InstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/llvm/lib/Target/ARM/Thumb1InstrInfo.h index 68b326c0ebef6..16350a65a6198 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -35,7 +35,7 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const { return RI; } void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index f5653d459eac8..b66e4075d4cfa 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -46,7 +46,7 @@ PreferNoCSEL("prefer-no-csel", cl::Hidden, cl::init(false)); Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb2InstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 1b0bf2d499510..59ef39d442aef 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -58,7 +58,7 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const { return RI; } MachineInstr *optimizeSelect(MachineInstr &MI, SmallPtrSetImpl &SeenMIs, diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index ce9908597dcac..5e247cb64a991 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -30,8 +30,8 @@ namespace llvm { AVRInstrInfo::AVRInstrInfo(const AVRSubtarget &STI) - : AVRGenInstrInfo(STI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(), - STI(STI) {} + : AVRGenInstrInfo(STI, RI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), + RI(), STI(STI) {} void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp index 409f8b4c253b8..0e56e658952a2 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp +++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp @@ -27,7 +27,7 @@ using namespace llvm; BPFInstrInfo::BPFInstrInfo(const BPFSubtarget &STI) - : BPFGenInstrInfo(STI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} + : BPFGenInstrInfo(STI, RI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index 619a797be6dc7..34a7de8d8ae96 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -25,7 +25,7 @@ using namespace llvm; #include "CSKYGenInstrInfo.inc" CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI) - : CSKYGenInstrInfo(STI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), + : CSKYGenInstrInfo(STI, RI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) { v2sf = STI.hasFPUv2SingleFloat(); v2df = STI.hasFPUv2DoubleFloat(); diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp index 6579d3405cf39..057d87bc3c6a9 100644 --- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp +++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp @@ -10,6 +10,7 @@ #include "DirectX.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DXILResource.h" +#include "llvm/Frontend/HLSL/HLSLResource.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -20,6 +21,7 @@ #include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/User.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Transforms/Utils/ValueMapper.h" #define DEBUG_TYPE "dxil-resource-access" @@ -44,16 +46,28 @@ static Value *calculateGEPOffset(GetElementPtrInst *GEP, Value *PrevOffset, APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0); if (GEP->accumulateConstantOffset(DL, ConstantOffset)) { APInt Scaled = ConstantOffset.udiv(ScalarSize); - return ConstantInt::get(Type::getInt32Ty(GEP->getContext()), Scaled); + return ConstantInt::get(DL.getIndexType(GEP->getType()), Scaled); } - auto IndexIt = GEP->idx_begin(); - assert(cast(IndexIt)->getZExtValue() == 0 && - "GEP is not indexing through pointer"); - ++IndexIt; - Value *Offset = *IndexIt; - assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP"); - return Offset; + unsigned NumIndices = GEP->getNumIndices(); + + // If we have a single index we're indexing into a top level array. This + // generally only happens with cbuffers. + if (NumIndices == 1) + return *GEP->idx_begin(); + + // If we have two indices, this should be a simple access through a pointer. + if (NumIndices == 2) { + auto IndexIt = GEP->idx_begin(); + assert(cast(IndexIt)->getZExtValue() == 0 && + "GEP is not indexing through pointer"); + ++IndexIt; + Value *Offset = *IndexIt; + assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP"); + return Offset; + } + + llvm_unreachable("Unhandled GEP structure for resource access"); } static void createTypedBufferStore(IntrinsicInst *II, StoreInst *SI, @@ -171,6 +185,127 @@ static void createRawLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset) { LI->replaceAllUsesWith(V); } +namespace { +/// Helper for building a `load.cbufferrow` intrinsic given a simple type. +struct CBufferRowIntrin { + Intrinsic::ID IID; + Type *RetTy; + unsigned int EltSize; + unsigned int NumElts; + + CBufferRowIntrin(const DataLayout &DL, Type *Ty) { + assert(Ty == Ty->getScalarType() && "Expected scalar type"); + + switch (DL.getTypeSizeInBits(Ty)) { + case 16: + IID = Intrinsic::dx_resource_load_cbufferrow_8; + RetTy = StructType::get(Ty, Ty, Ty, Ty, Ty, Ty, Ty, Ty); + EltSize = 2; + NumElts = 8; + break; + case 32: + IID = Intrinsic::dx_resource_load_cbufferrow_4; + RetTy = StructType::get(Ty, Ty, Ty, Ty); + EltSize = 4; + NumElts = 4; + break; + case 64: + IID = Intrinsic::dx_resource_load_cbufferrow_2; + RetTy = StructType::get(Ty, Ty); + EltSize = 8; + NumElts = 2; + break; + default: + llvm_unreachable("Only 16, 32, and 64 bit types supported"); + } + } +}; +} // namespace + +static void createCBufferLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset, + dxil::ResourceTypeInfo &RTI) { + const DataLayout &DL = LI->getDataLayout(); + + Type *Ty = LI->getType(); + assert(!isa(Ty) && "Structs not handled yet"); + CBufferRowIntrin Intrin(DL, Ty->getScalarType()); + + StringRef Name = LI->getName(); + Value *Handle = II->getOperand(0); + + IRBuilder<> Builder(LI); + + ConstantInt *GlobalOffset = dyn_cast(II->getOperand(1)); + assert(GlobalOffset && "CBuffer getpointer index must be constant"); + + unsigned int FixedOffset = GlobalOffset->getZExtValue(); + // If we have a further constant offset we can just fold it in to the fixed + // offset. + if (auto *ConstOffset = dyn_cast_if_present(Offset)) { + FixedOffset += ConstOffset->getZExtValue(); + Offset = nullptr; + } + + Value *CurrentRow = ConstantInt::get( + Builder.getInt32Ty(), FixedOffset / hlsl::CBufferRowSizeInBytes); + unsigned int CurrentIndex = + (FixedOffset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize; + + assert(!(CurrentIndex && Offset) && + "Dynamic indexing into elements of cbuffer rows is not supported"); + // At this point if we have a non-constant offset it has to be an array + // offset, so we can assume that it's a multiple of the row size. + if (Offset) + CurrentRow = FixedOffset ? Builder.CreateAdd(CurrentRow, Offset) : Offset; + + auto *CBufLoad = Builder.CreateIntrinsic( + Intrin.RetTy, Intrin.IID, {Handle, CurrentRow}, nullptr, Name + ".load"); + auto *Elt = + Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, Name + ".extract"); + + // At this point we've loaded the first scalar of our result, but our original + // type may have been a vector. + unsigned int Remaining = + ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1; + if (Remaining == 0) { + // We only have a single element, so we're done. + Value *Result = Elt; + + // However, if we loaded a <1 x T>, then we need to adjust the type. + if (auto *VT = dyn_cast(Ty)) { + assert(VT->getNumElements() == 1 && "Can't have multiple elements here"); + Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result, + Builder.getInt32(0), Name); + } + LI->replaceAllUsesWith(Result); + return; + } + + // Walk each element and extract it, wrapping to new rows as needed. + SmallVector Extracts{Elt}; + while (Remaining--) { + CurrentIndex %= Intrin.NumElts; + + if (CurrentIndex == 0) { + CurrentRow = Builder.CreateAdd(CurrentRow, + ConstantInt::get(Builder.getInt32Ty(), 1)); + CBufLoad = Builder.CreateIntrinsic(Intrin.RetTy, Intrin.IID, + {Handle, CurrentRow}, nullptr, + Name + ".load"); + } + + Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, + Name + ".extract")); + } + + // Finally, we build up the original loaded value. + Value *Result = PoisonValue::get(Ty); + for (int I = 0, E = Extracts.size(); I < E; ++I) + Result = Builder.CreateInsertElement( + Result, Extracts[I], Builder.getInt32(I), Name + formatv(".upto{}", I)); + LI->replaceAllUsesWith(Result); +} + static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, dxil::ResourceTypeInfo &RTI) { switch (RTI.getResourceKind()) { @@ -179,6 +314,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, case dxil::ResourceKind::RawBuffer: case dxil::ResourceKind::StructuredBuffer: return createRawLoad(II, LI, Offset); + case dxil::ResourceKind::CBuffer: + return createCBufferLoad(II, LI, Offset, RTI); case dxil::ResourceKind::Texture1D: case dxil::ResourceKind::Texture2D: case dxil::ResourceKind::Texture2DMS: @@ -190,9 +327,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, case dxil::ResourceKind::TextureCubeArray: case dxil::ResourceKind::FeedbackTexture2D: case dxil::ResourceKind::FeedbackTexture2DArray: - case dxil::ResourceKind::CBuffer: case dxil::ResourceKind::TBuffer: - // TODO: handle these + reportFatalUsageError("Load not yet implemented for resource type"); return; case dxil::ResourceKind::Sampler: case dxil::ResourceKind::RTAccelerationStructure: diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp index bb2efa43d818c..401881d6d0f67 100644 --- a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp @@ -19,6 +19,6 @@ using namespace llvm; DirectXInstrInfo::DirectXInstrInfo(const DirectXSubtarget &STI) - : DirectXGenInstrInfo(STI) {} + : DirectXGenInstrInfo(STI, RI) {} DirectXInstrInfo::~DirectXInstrInfo() {} diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 55bafdea234fd..dd9f2fa5cf342 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -118,9 +118,9 @@ const int Hexagon_ADDI_OFFSET_MIN = -32768; void HexagonInstrInfo::anchor() {} HexagonInstrInfo::HexagonInstrInfo(const HexagonSubtarget &ST) - : HexagonGenInstrInfo(ST, Hexagon::ADJCALLSTACKDOWN, + : HexagonGenInstrInfo(ST, RegInfo, Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), - Subtarget(ST) {} + RegInfo(ST.getHwMode()), Subtarget(ST) {} namespace llvm { namespace HexagonFUnits { diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index 48adf82833f51..7a0c77cc3f705 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -23,6 +23,8 @@ #include #include +#include "HexagonRegisterInfo.h" + #define GET_INSTRINFO_HEADER #include "HexagonGenInstrInfo.inc" @@ -36,6 +38,7 @@ class MachineOperand; class TargetRegisterInfo; class HexagonInstrInfo : public HexagonGenInstrInfo { + const HexagonRegisterInfo RegInfo; const HexagonSubtarget &Subtarget; enum BundleAttribute { @@ -47,6 +50,8 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { public: explicit HexagonInstrInfo(const HexagonSubtarget &ST); + const HexagonRegisterInfo &getRegisterInfo() const { return RegInfo; } + /// TargetInstrInfo overrides. /// If the specified machine instruction is a direct diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index a3c8a882c0616..66c8b0a67169d 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -76,8 +76,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, OptLevel(TM.getOptLevel()), CPUString(std::string(Hexagon_MC::selectHexagonCPU(CPU))), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), - RegInfo(getHwMode()), TLInfo(TM, *this), - InstrItins(getInstrItineraryForCPU(CPUString)) { + TLInfo(TM, *this), InstrItins(getInstrItineraryForCPU(CPUString)) { Hexagon_MC::addArchSubtarget(this, FS); // Beware of the default constructor of InstrItineraryData: it will // reset all members to 0. diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 995f66d0551b4..30794f61218a1 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -100,7 +100,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { // The following objects can use the TargetTriple, so they must be // declared after it. HexagonInstrInfo InstrInfo; - HexagonRegisterInfo RegInfo; HexagonTargetLowering TLInfo; HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; @@ -122,7 +121,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { } const HexagonInstrInfo *getInstrInfo() const override { return &InstrInfo; } const HexagonRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const HexagonTargetLowering *getTargetLowering() const override { return &TLInfo; diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp index 02ed1001cd0d3..b3d28565d4f06 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -27,7 +27,8 @@ using namespace llvm; #include "LanaiGenInstrInfo.inc" LanaiInstrInfo::LanaiInstrInfo(const LanaiSubtarget &STI) - : LanaiGenInstrInfo(STI, Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP), + : LanaiGenInstrInfo(STI, RegisterInfo, Lanai::ADJCALLSTACKDOWN, + Lanai::ADJCALLSTACKUP), RegisterInfo() {} void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 9a35df2f240c4..5eb3bd6d01a64 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -26,9 +26,9 @@ using namespace llvm; #include "LoongArchGenInstrInfo.inc" LoongArchInstrInfo::LoongArchInstrInfo(const LoongArchSubtarget &STI) - : LoongArchGenInstrInfo(STI, LoongArch::ADJCALLSTACKDOWN, + : LoongArchGenInstrInfo(STI, RegInfo, LoongArch::ADJCALLSTACKDOWN, LoongArch::ADJCALLSTACKUP), - STI(STI) {} + RegInfo(STI.getHwMode()), STI(STI) {} MCInst LoongArchInstrInfo::getNop() const { return MCInstBuilder(LoongArch::ANDI) diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index e61314c034bdb..d43d229256c13 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -24,9 +24,13 @@ namespace llvm { class LoongArchSubtarget; class LoongArchInstrInfo : public LoongArchGenInstrInfo { + const LoongArchRegisterInfo RegInfo; + public: explicit LoongArchInstrInfo(const LoongArchSubtarget &STI); + const LoongArchRegisterInfo &getRegisterInfo() const { return RegInfo; } + MCInst getNop() const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp index 3acbe4992273a..76a8ba1c90e50 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp @@ -95,4 +95,4 @@ LoongArchSubtarget::LoongArchSubtarget(const Triple &TT, StringRef CPU, : LoongArchGenSubtargetInfo(TT, CPU, TuneCPU, FS), FrameLowering( initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)), - InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) {} + InstrInfo(*this), TLInfo(TM, *this) {} diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h index 5e12bafebb0d5..2beff07949daf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h @@ -45,7 +45,6 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown; LoongArchFrameLowering FrameLowering; LoongArchInstrInfo InstrInfo; - LoongArchRegisterInfo RegInfo; LoongArchTargetLowering TLInfo; SelectionDAGTargetInfo TSInfo; @@ -78,7 +77,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { } const LoongArchInstrInfo *getInstrInfo() const override { return &InstrInfo; } const LoongArchRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const LoongArchTargetLowering *getTargetLowering() const override { return &TLInfo; diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 65b4820752c94..af053b8645b9b 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -26,7 +26,8 @@ using namespace llvm; void MSP430InstrInfo::anchor() {} MSP430InstrInfo::MSP430InstrInfo(const MSP430Subtarget &STI) - : MSP430GenInstrInfo(STI, MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP), + : MSP430GenInstrInfo(STI, RI, MSP430::ADJCALLSTACKDOWN, + MSP430::ADJCALLSTACKUP), RI() {} void MSP430InstrInfo::storeRegToStackSlot( diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index aa94f54cdf9a0..69b96cf3fc933 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -37,11 +37,7 @@ using namespace llvm; #define DEBUG_TYPE "mips16-instrinfo" Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, Mips::Bimm16), RI(STI) {} - -const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { - return RI; -} + : MipsInstrInfo(STI, RI, Mips::Bimm16), RI(STI) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.h b/llvm/lib/Target/Mips/Mips16InstrInfo.h index 1058e8c25fb5b..2834fd3d7eec2 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.h +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.h @@ -30,7 +30,7 @@ class Mips16InstrInfo : public MipsInstrInfo { public: explicit Mips16InstrInfo(const MipsSubtarget &STI); - const MipsRegisterInfo &getRegisterInfo() const override; + const Mips16RegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp index bffdffa4af6a0..c879c46e49dd4 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -39,8 +39,9 @@ using namespace llvm; // Pin the vtable to this file. void MipsInstrInfo::anchor() {} -MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr) - : MipsGenInstrInfo(STI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), +MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, + const MipsRegisterInfo &RI, unsigned UncondBr) + : MipsGenInstrInfo(STI, RI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), Subtarget(STI), UncondBrOpc(UncondBr) {} const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) { diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index 2337ae7c079e7..fc9424808756a 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -55,7 +55,8 @@ class MipsInstrInfo : public MipsGenInstrInfo { BT_Indirect // One indirct branch. }; - explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc); + explicit MipsInstrInfo(const MipsSubtarget &STI, const MipsRegisterInfo &RI, + unsigned UncondBrOpc); MCInst getNop() const override; @@ -130,7 +131,10 @@ class MipsInstrInfo : public MipsGenInstrInfo { /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). - virtual const MipsRegisterInfo &getRegisterInfo() const = 0; + const MipsRegisterInfo &getRegisterInfo() const { + return static_cast( + TargetInstrInfo::getRegisterInfo()); + } virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0; diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index dbdbb179a583d..517f489984c27 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -28,11 +28,7 @@ static unsigned getUnconditionalBranch(const MipsSubtarget &STI) { } MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, getUnconditionalBranch(STI)), RI(STI) {} - -const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const { - return RI; -} + : MipsInstrInfo(STI, RI, getUnconditionalBranch(STI)), RI(STI) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/llvm/lib/Target/Mips/MipsSEInstrInfo.h index 2b4f55d184b8b..0a7a0e5ac2a56 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.h @@ -24,7 +24,7 @@ class MipsSEInstrInfo : public MipsInstrInfo { public: explicit MipsSEInstrInfo(const MipsSubtarget &STI); - const MipsRegisterInfo &getRegisterInfo() const override; + const MipsSERegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 6840c7ae8faf4..db2d96f5ff532 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -26,7 +26,7 @@ using namespace llvm; void NVPTXInstrInfo::anchor() {} NVPTXInstrInfo::NVPTXInstrInfo(const NVPTXSubtarget &STI) - : NVPTXGenInstrInfo(STI), RegInfo() {} + : NVPTXGenInstrInfo(STI, RegInfo), RegInfo() {} void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 3014aa6bfe31e..8d9d4c7f8e128 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -89,7 +89,7 @@ static cl::opt EnableFMARegPressureReduction( void PPCInstrInfo::anchor() {} PPCInstrInfo::PPCInstrInfo(const PPCSubtarget &STI) - : PPCGenInstrInfo(STI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, + : PPCGenInstrInfo(STI, RI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, /* CatchRetOpcode */ -1, STI.isPPC64() ? PPC::BLR8 : PPC::BLR), Subtarget(STI), RI(STI.getTargetMachine()) {} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index b05956b674d18..ce8dd3ba4b7b3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -82,8 +82,9 @@ namespace llvm::RISCV { } // end namespace llvm::RISCV RISCVInstrInfo::RISCVInstrInfo(const RISCVSubtarget &STI) - : RISCVGenInstrInfo(STI, RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP), - STI(STI) {} + : RISCVGenInstrInfo(STI, RegInfo, RISCV::ADJCALLSTACKDOWN, + RISCV::ADJCALLSTACKUP), + RegInfo(STI.getHwMode()), STI(STI) {} #define GET_INSTRINFO_HELPERS #include "RISCVGenInstrInfo.inc" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index c5eddb9e90fbf..800af2621b7df 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -79,10 +79,13 @@ enum RISCVMachineCombinerPattern : unsigned { }; class RISCVInstrInfo : public RISCVGenInstrInfo { + const RISCVRegisterInfo RegInfo; public: explicit RISCVInstrInfo(const RISCVSubtarget &STI); + const RISCVRegisterInfo &getRegisterInfo() const { return RegInfo; } + MCInst getNop() const override; Register isLoadFromStackSlot(const MachineInstr &MI, diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 715ac4cedc649..3b43be3d15743 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -104,7 +104,7 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, RVVVectorBitsMin(RVVVectorBitsMin), RVVVectorBitsMax(RVVVectorBitsMax), FrameLowering( initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)), - InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) { + InstrInfo(*this), TLInfo(TM, *this) { TSInfo = std::make_unique(); } diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4b4fc8f0d8e76..d5ffa6c812cec 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -112,7 +112,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { RISCVFrameLowering FrameLowering; RISCVInstrInfo InstrInfo; - RISCVRegisterInfo RegInfo; RISCVTargetLowering TLInfo; /// Initializes using the passed in CPU and feature strings so that we can @@ -140,7 +139,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { } const RISCVInstrInfo *getInstrInfo() const override { return &InstrInfo; } const RISCVRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const RISCVTargetLowering *getTargetLowering() const override { return &TLInfo; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index ba95ad822df75..4f8bf4312a380 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; SPIRVInstrInfo::SPIRVInstrInfo(const SPIRVSubtarget &STI) - : SPIRVGenInstrInfo(STI) {} + : SPIRVGenInstrInfo(STI, RI) {} bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const { switch (MI.getOpcode()) { diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index f66eb9dbee2dc..fcd6cd7f262bc 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -38,8 +38,8 @@ static cl::opt void SparcInstrInfo::anchor() {} SparcInstrInfo::SparcInstrInfo(const SparcSubtarget &ST) - : SparcGenInstrInfo(ST, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(ST), - Subtarget(ST) {} + : SparcGenInstrInfo(ST, RI, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), + RI(ST), Subtarget(ST) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 2e21f27c9032f..23e7e7e314033 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -60,7 +60,7 @@ static uint64_t allOnes(unsigned int Count) { void SystemZInstrInfo::anchor() {} SystemZInstrInfo::SystemZInstrInfo(const SystemZSubtarget &sti) - : SystemZGenInstrInfo(sti, -1, -1), + : SystemZGenInstrInfo(sti, RI, -1, -1), RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister(), sti.getHwMode()), STI(sti) {} diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index d5e804afd27fe..bae703bd97ac8 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -35,7 +35,7 @@ using namespace llvm; void VEInstrInfo::anchor() {} VEInstrInfo::VEInstrInfo(const VESubtarget &ST) - : VEGenInstrInfo(ST, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} + : VEGenInstrInfo(ST, RI, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 343d90e88950f..8b4e4fbbbd1e5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -34,7 +34,7 @@ using namespace llvm; #include "WebAssemblyGenInstrInfo.inc" WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) - : WebAssemblyGenInstrInfo(STI, WebAssembly::ADJCALLSTACKDOWN, + : WebAssemblyGenInstrInfo(STI, RI, WebAssembly::ADJCALLSTACKDOWN, WebAssembly::ADJCALLSTACKUP, WebAssembly::CATCHRET), RI(STI.getTargetTriple()) {} diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 6b2a7a4ec3583..2c6d1af225b6b 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -85,7 +85,7 @@ static cl::opt UndefRegClearance( void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) - : X86GenInstrInfo(STI, + : X86GenInstrInfo(STI, RI, (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 868f41375b96b..4f5aadca361fe 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -419,6 +419,8 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } + bool enableTerminalRule() const override { return true; } + bool enableEarlyIfConversion() const override; void getPostRAMutations(std::vector> diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp index 1a9133aad4dd3..80fda3480aa44 100644 --- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp @@ -43,7 +43,7 @@ namespace XCore { void XCoreInstrInfo::anchor() {} XCoreInstrInfo::XCoreInstrInfo(const XCoreSubtarget &ST) - : XCoreGenInstrInfo(ST, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP), + : XCoreGenInstrInfo(ST, RI, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP), RI() {} static bool isZeroImm(const MachineOperand &op) { diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp index be69cefb5b78f..6bbebdecd9988 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp @@ -48,7 +48,8 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) { } XtensaInstrInfo::XtensaInstrInfo(const XtensaSubtarget &STI) - : XtensaGenInstrInfo(STI, Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP), + : XtensaGenInstrInfo(STI, RI, Xtensa::ADJCALLSTACKDOWN, + Xtensa::ADJCALLSTACKUP), RI(STI), STI(STI) {} Register XtensaInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 5e4303a4c5fff..90696ffc3aca7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -99,20 +99,20 @@ VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def) VPValue::~VPValue() { assert(Users.empty() && "trying to delete a VPValue with remaining users"); - if (Def) + if (VPDef *Def = getDefiningRecipe()) Def->removeDefinedValue(this); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const { - if (const VPRecipeBase *R = dyn_cast_or_null(Def)) + if (const VPRecipeBase *R = getDefiningRecipe()) R->print(OS, "", SlotTracker); else printAsOperand(OS, SlotTracker); } void VPValue::dump() const { - const VPRecipeBase *Instr = dyn_cast_or_null(this->Def); + const VPRecipeBase *Instr = getDefiningRecipe(); VPSlotTracker SlotTracker( (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); print(dbgs(), SlotTracker); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f5bef08fafcdc..eab642678702a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -4147,13 +4147,13 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { /// is defined at \p Idx of a load interleave group. static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx) { - auto *DefR = OpV->getDefiningRecipe(); - if (!DefR) - return WideMember0->getOperand(OpIdx) == OpV; - if (auto *W = dyn_cast(DefR)) - return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV; - - if (auto *IR = dyn_cast(DefR)) + VPValue *Member0Op = WideMember0->getOperand(OpIdx); + VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe(); + if (!Member0OpR) + return Member0Op == OpV; + if (auto *W = dyn_cast(Member0OpR)) + return !W->getMask() && Member0Op == OpV; + if (auto *IR = dyn_cast(Member0OpR)) return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV; return false; } diff --git a/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir new file mode 100644 index 0000000000000..8c31e7c2d1cec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir @@ -0,0 +1,55 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s | FileCheck %s +--- | + define i32 @test01() nounwind { + entry: + %0 = select i1 true, i32 1, i32 0 + %1 = and i32 %0, 65535 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %if.then, label %if.end + + if.then: ; preds = %entry + ret i32 1 + + if.end: ; preds = %entry + ret i32 0 + } +... +--- +name: test01 +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr32common } +body: | + ; CHECK-LABEL: name: test01 + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 1 + ; CHECK-NEXT: [[ANDSWri:%[0-9]+]]:gpr32common = ANDSWri killed [[ANDSWri]], 15, implicit-def $nzcv + ; CHECK-NEXT: Bcc 12, %bb.2, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.if.then: + ; CHECK-NEXT: $w0 = MOVi32imm 1 + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.end: + ; CHECK-NEXT: $w0 = MOVi32imm 0 + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + bb.0.entry: + successors: %bb.2.if.end, %bb.1.if.then + + %0 = MOVi32imm 1 + %1 = ANDWri killed %1, 15 + $wzr = SUBSWri killed %1, 0, 0, implicit-def $nzcv + Bcc 12, %bb.2.if.end, implicit $nzcv + + bb.1.if.then: + $w0 = MOVi32imm 1 + RET_ReallyLR implicit $w0 + + bb.2.if.end: + $w0 = MOVi32imm 0 + RET_ReallyLR implicit $w0 + +... diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll index dbbfbea9176f6..f725c19081deb 100644 --- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll @@ -188,11 +188,11 @@ entry: define <8 x i8> @test11(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test11: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ld1r { v1.8b }, [x0] -; CHECK-NEXT: ld1r { v2.8b }, [x1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.h[2], v2.h[0] -; CHECK-NEXT: mov v0.h[3], v1.h[0] +; CHECK-NEXT: ld1r { v0.8b }, [x0] +; CHECK-NEXT: ld1r { v1.8b }, [x1] +; CHECK-NEXT: fmov d2, d0 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: mov v0.h[3], v2.h[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll index 33c5ba7987974..8297fa2d4e3f9 100644 --- a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll +++ b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll @@ -161,6 +161,338 @@ define i1 @lt64_u16_and_23(i64 %0) { ret i1 %3 } +define i1 @test_disjoint(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp eq i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp sgt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp slt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint4(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: and w8, w9, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp sle i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_4(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint_inverse_4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bic w8, w9, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp sle i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp eq i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint2_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp sgt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint3_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp slt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp eq i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint2_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp sgt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint3_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp slt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint4_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint4_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: and x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp sle i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_4_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_inverse_4_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bic x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp sle i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp eq i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint2_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp sgt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint3_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp slt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + ; negative test define i1 @lt3_u8(i8 %0) { ; CHECK-LABEL: lt3_u8: diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll index 3230c9e946da7..b3a7ec961b736 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll @@ -20,20 +20,17 @@ define i32 @sink_load_and_copy(i32 %n) { ; CHECK-NEXT: b.lt .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: adrp x8, A -; CHECK-NEXT: mov w20, w19 -; CHECK-NEXT: ldr w21, [x8, :lo12:A] +; CHECK-NEXT: mov w21, w19 +; CHECK-NEXT: ldr w20, [x8, :lo12:A] ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov w0, w21 +; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w20, w20, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB0_2 -; CHECK-NEXT: b .LBB0_4 -; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: mov w20, w19 -; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup -; CHECK-NEXT: mov w0, w20 +; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -82,15 +79,12 @@ define i32 @cant_sink_successive_call(i32 %n) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w21, w21, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB1_2 -; CHECK-NEXT: b .LBB1_4 -; CHECK-NEXT: .LBB1_3: -; CHECK-NEXT: mov w21, w19 -; CHECK-NEXT: .LBB1_4: // %for.cond.cleanup +; CHECK-NEXT: .LBB1_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -139,15 +133,12 @@ define i32 @cant_sink_successive_store(ptr nocapture readnone %store, i32 %n) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w21, w21, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB2_2 -; CHECK-NEXT: b .LBB2_4 -; CHECK-NEXT: .LBB2_3: -; CHECK-NEXT: mov w21, w19 -; CHECK-NEXT: .LBB2_4: // %for.cond.cleanup +; CHECK-NEXT: .LBB2_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll index e7e109170d6a1..338084295fc7f 100644 --- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll +++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll @@ -16,13 +16,12 @@ define i32 @test(ptr %ptr) { ; CHECK-NEXT: mov w9, wzr ; CHECK-NEXT: LBB0_1: ; %.thread ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lsr w11, w9, #1 ; CHECK-NEXT: sub w10, w9, #1 -; CHECK-NEXT: mov w9, w11 +; CHECK-NEXT: lsr w9, w9, #1 ; CHECK-NEXT: tbnz w10, #0, LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb343 ; CHECK-NEXT: and w9, w10, #0x1 -; CHECK-NEXT: mov w0, #-1 +; CHECK-NEXT: mov w0, #-1 ; =0xffffffff ; CHECK-NEXT: str w9, [x8] ; CHECK-NEXT: ret bb: diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index b947c943ba448..72f6646930624 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -151,12 +151,11 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_f16 @@ -190,12 +189,11 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f32 @@ -229,12 +227,11 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_f64 @@ -273,12 +270,11 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 @@ -313,12 +309,11 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 @@ -353,12 +348,11 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 @@ -393,12 +387,11 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 @@ -433,12 +426,11 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 @@ -513,12 +505,11 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 @@ -557,12 +548,11 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 @@ -596,12 +586,11 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 @@ -635,12 +624,11 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 @@ -674,12 +662,11 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 @@ -713,12 +700,11 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 @@ -752,12 +738,11 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8bf16 @@ -791,12 +776,11 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 @@ -830,12 +814,11 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index f2163ad15bafc..df88f37195ed6 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -129,12 +129,11 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: mrs x19, SVCR -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll index 6c6a691760af3..52a77cb396909 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll @@ -147,15 +147,15 @@ define <2 x float> @extract_v2f32_nxv16f32_2( %arg) { define <4 x i1> @extract_v4i1_nxv32i1_0( %arg) { ; CHECK-LABEL: extract_v4i1_nxv32i1_0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.b[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.b[2] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.b[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: umov w8, v1.b[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: umov w8, v1.b[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1( %arg, i64 0) ret <4 x i1> %ext diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index e10313773c73e..72994100b2970 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -248,15 +248,15 @@ define <2 x i1> @extract_v2i1_nxv2i1( %inmask) { define <4 x i1> @extract_v4i1_nxv4i1( %inmask) { ; CHECK-LABEL: extract_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <4 x i1> @llvm.vector.extract.v4i1.nxv4i1( %inmask, i64 0) ret <4 x i1> %mask @@ -265,23 +265,23 @@ define <4 x i1> @extract_v4i1_nxv4i1( %inmask) { define <8 x i1> @extract_v8i1_nxv8i1( %inmask) { ; CHECK-LABEL: extract_v8i1_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v0.b[2], w8 ; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v0.b[2], w9 -; CHECK-NEXT: umov w9, v1.h[4] ; CHECK-NEXT: mov v0.b[3], w8 +; CHECK-NEXT: umov w8, v1.h[4] +; CHECK-NEXT: mov v0.b[4], w8 ; CHECK-NEXT: umov w8, v1.h[5] -; CHECK-NEXT: mov v0.b[4], w9 -; CHECK-NEXT: umov w9, v1.h[6] ; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: umov w8, v1.h[6] +; CHECK-NEXT: mov v0.b[6], w8 ; CHECK-NEXT: umov w8, v1.h[7] -; CHECK-NEXT: mov v0.b[6], w9 ; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( %inmask, i64 0) ret <8 x i1> %mask @@ -292,9 +292,9 @@ define <8 x i1> @extract_v8i1_nxv8i1( %inmask) { define <16 x i1> @extract_v16i1_nxv16i1( %inmask) { ; CHECK-LABEL: extract_v16i1_nxv16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.b[1], v1.b[1] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v0.b[1], v0.b[1] ; CHECK-NEXT: mov v0.b[2], v1.b[2] ; CHECK-NEXT: mov v0.b[3], v1.b[3] ; CHECK-NEXT: mov v0.b[4], v1.b[4] @@ -309,6 +309,7 @@ define <16 x i1> @extract_v16i1_nxv16i1( %inmask) { ; CHECK-NEXT: mov v0.b[13], v1.b[13] ; CHECK-NEXT: mov v0.b[14], v1.b[14] ; CHECK-NEXT: mov v0.b[15], v1.b[15] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %mask = call <16 x i1> @llvm.vector.extract.v16i1.nxv16i1( %inmask, i64 0) ret <16 x i1> %mask diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll index 41e4a38fad90b..8e807cda7166d 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll @@ -8,15 +8,15 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i1> @reshuffle_v4i1_nxv4i1( %a) #0 { ; CHECK-LABEL: reshuffle_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %el0 = extractelement %a, i32 0 %el1 = extractelement %a, i32 1 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index ba4a3a2042305..bd8f432579a08 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -28,53 +28,53 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK: // %bb.0: ; CHECK-NEXT: tbnz w1, #0, .LBB1_2 ; CHECK-NEXT: // %bb.1: // %vector.body +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: umov w8, v0.b[8] -; CHECK-NEXT: mov v1.b[1], v0.b[1] -; CHECK-NEXT: movprfx z3, z0 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #16 +; CHECK-NEXT: umov w8, v2.b[8] +; CHECK-NEXT: mov v0.b[1], v2.b[1] +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #16 ; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: mov v1.b[2], v0.b[2] -; CHECK-NEXT: mov v2.b[1], v0.b[9] -; CHECK-NEXT: mov v1.b[3], v0.b[3] -; CHECK-NEXT: mov v2.b[2], v0.b[10] -; CHECK-NEXT: mov v1.b[4], v0.b[4] -; CHECK-NEXT: mov v2.b[3], v0.b[11] -; CHECK-NEXT: mov v1.b[5], v0.b[5] -; CHECK-NEXT: mov v2.b[4], v0.b[12] -; CHECK-NEXT: mov v1.b[6], v0.b[6] -; CHECK-NEXT: mov v2.b[5], v0.b[13] -; CHECK-NEXT: mov v1.b[7], v0.b[7] -; CHECK-NEXT: mov v2.b[6], v0.b[14] -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: mov v2.b[7], v0.b[15] -; CHECK-NEXT: uunpklo z0.h, z3.b +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov v0.b[2], v2.b[2] +; CHECK-NEXT: mov v1.b[1], v2.b[9] +; CHECK-NEXT: mov v0.b[3], v2.b[3] +; CHECK-NEXT: mov v1.b[2], v2.b[10] +; CHECK-NEXT: mov v0.b[4], v2.b[4] +; CHECK-NEXT: mov v1.b[3], v2.b[11] +; CHECK-NEXT: mov v0.b[5], v2.b[5] +; CHECK-NEXT: mov v1.b[4], v2.b[12] +; CHECK-NEXT: mov v0.b[6], v2.b[6] +; CHECK-NEXT: mov v1.b[5], v2.b[13] +; CHECK-NEXT: mov v0.b[7], v2.b[7] +; CHECK-NEXT: mov v1.b[6], v2.b[14] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: mov v1.b[7], v2.b[15] +; CHECK-NEXT: uunpklo z2.h, z3.b ; CHECK-NEXT: uunpklo z3.h, z4.b -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: lsl z2.s, z2.s, #31 ; CHECK-NEXT: lsl z3.s, z3.s, #31 -; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 +; CHECK-NEXT: asr z2.s, z2.s, #31 ; CHECK-NEXT: asr z3.s, z3.s, #31 -; CHECK-NEXT: lsl z2.s, z2.s, #31 -; CHECK-NEXT: cmpne p3.s, p0/z, z1.s, #0 -; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: cmpne p3.s, p0/z, z0.s, #0 +; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, #0 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 -; CHECK-NEXT: asr z2.s, z2.s, #31 -; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 -; CHECK-NEXT: st1w { z0.s }, p1, [x0, #2, mul vl] -; CHECK-NEXT: st1w { z0.s }, p2, [x0, #3, mul vl] -; CHECK-NEXT: st1w { z0.s }, p3, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x0, #1, mul vl] +; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; CHECK-NEXT: st1w { z2.s }, p1, [x0, #2, mul vl] +; CHECK-NEXT: st1w { z2.s }, p2, [x0, #3, mul vl] +; CHECK-NEXT: st1w { z2.s }, p3, [x0] +; CHECK-NEXT: st1w { z2.s }, p0, [x0, #1, mul vl] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll index 124f81e7864d1..39fe92aae0619 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll @@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) { ; CHECK-NEXT: whilelt p0.s, wzr, w0 ; CHECK-NEXT: b.pl .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: cntw x9 +; CHECK-NEXT: mov w9, wzr +; CHECK-NEXT: cntw x8 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: whilelt p0.s, w8, w0 -; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: whilelt p0.s, w9, w0 +; CHECK-NEXT: add w9, w9, w8 ; CHECK-NEXT: b.mi .LBB0_2 ; CHECK-NEXT: .LBB0_3: // %exit ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 74a717f1635a3..935189dec48ac 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2835,11 +2835,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB24_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -2847,11 +2847,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB24_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -2950,26 +2950,26 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB25_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x0] -; CHECK-BE-NEXT: add x9, x1, #48 -; CHECK-BE-NEXT: add x8, x1, #32 -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] +; CHECK-BE-NEXT: add x10, x1, #48 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x1] +; CHECK-BE-NEXT: add x9, x1, #32 +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] ; CHECK-BE-NEXT: add x1, x1, #16 -; CHECK-BE-NEXT: ld1 { v20.4s }, [x8] +; CHECK-BE-NEXT: ld1 { v20.4s }, [x9] ; CHECK-BE-NEXT: ld1 { v22.4s }, [x1] -; CHECK-BE-NEXT: add x8, x0, #96 +; CHECK-BE-NEXT: add x9, x0, #96 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v2.16b ; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b ; CHECK-BE-NEXT: ext v24.16b, v18.16b, v18.16b, #8 -; CHECK-BE-NEXT: add x9, x0, #32 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ext v25.16b, v20.16b, v20.16b, #8 -; CHECK-BE-NEXT: add x10, x0, #16 +; CHECK-BE-NEXT: add x10, x0, #32 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8 -; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v5.8b, v5.8b +; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v21.8b, v7.8b ; CHECK-BE-NEXT: rev32 v23.8b, v4.8b ; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8 @@ -2986,22 +2986,22 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s ; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: st1 { v5.2d }, [x9] ; CHECK-BE-NEXT: umull v5.2d, v6.2s, v20.2s ; CHECK-BE-NEXT: umull v6.2d, v7.2s, v21.2s -; CHECK-BE-NEXT: add x8, x0, #112 +; CHECK-BE-NEXT: add x9, x0, #112 ; CHECK-BE-NEXT: umull v4.2d, v4.2s, v16.2s -; CHECK-BE-NEXT: st1 { v18.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #80 +; CHECK-BE-NEXT: st1 { v18.2d }, [x10] +; CHECK-BE-NEXT: add x10, x0, #80 ; CHECK-BE-NEXT: st1 { v22.2d }, [x0] -; CHECK-BE-NEXT: st1 { v17.2d }, [x8] -; CHECK-BE-NEXT: add x8, x0, #64 -; CHECK-BE-NEXT: st1 { v19.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: mov x0, x8 -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: add x0, x0, #64 +; CHECK-BE-NEXT: st1 { v17.2d }, [x9] +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: st1 { v19.2d }, [x10] +; CHECK-BE-NEXT: st1 { v5.2d }, [x0] ; CHECK-BE-NEXT: st1 { v6.2d }, [x9] -; CHECK-BE-NEXT: st1 { v4.2d }, [x10] +; CHECK-BE-NEXT: st1 { v4.2d }, [x8] ; CHECK-BE-NEXT: b.ne .LBB25_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3093,13 +3093,14 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB26_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #32 +; CHECK-BE-NEXT: mov x8, x0 +; CHECK-BE-NEXT: add x9, x0, #32 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x0] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #16 -; CHECK-BE-NEXT: ld1 { v17.4s }, [x8] -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] -; CHECK-BE-NEXT: ld1 { v19.4s }, [x10] +; CHECK-BE-NEXT: add x10, x0, #48 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: ld1 { v17.4s }, [x9] +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] +; CHECK-BE-NEXT: ld1 { v19.4s }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v3.16b @@ -3113,11 +3114,10 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: mul v6.4s, v17.4s, v6.4s ; CHECK-BE-NEXT: mul v7.4s, v18.4s, v7.4s ; CHECK-BE-NEXT: mul v4.4s, v19.4s, v4.4s -; CHECK-BE-NEXT: st1 { v5.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x10 -; CHECK-BE-NEXT: st1 { v6.4s }, [x8] -; CHECK-BE-NEXT: st1 { v7.4s }, [x9] -; CHECK-BE-NEXT: st1 { v4.4s }, [x10] +; CHECK-BE-NEXT: st1 { v5.4s }, [x8] +; CHECK-BE-NEXT: st1 { v6.4s }, [x9] +; CHECK-BE-NEXT: st1 { v7.4s }, [x10] +; CHECK-BE-NEXT: st1 { v4.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB26_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3246,11 +3246,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB28_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -3258,11 +3258,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: smull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: smull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: smull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB28_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 30a78648c186a..39618b05e6c71 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -368,7 +368,10 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) { define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { ; GCN-LABEL: test_clamp_bf16_folding: ; GCN: ; %bb.0: -; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp +; GCN-NEXT: v_exp_bf16_e32 v0, v0 +; GCN-NEXT: v_nop +; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp ; GCN-NEXT: ; return to shader part epilog %exp = call bfloat @llvm.exp2.bf16(bfloat %src) %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0) diff --git a/llvm/test/CodeGen/AMDGPU/cc-entry.ll b/llvm/test/CodeGen/AMDGPU/cc-entry.ll new file mode 100644 index 0000000000000..5aa146ce87708 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cc-entry.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_kernel void @entry_fn() { +; CHECK-LABEL: entry_fn: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_sext_i32_i16 s5, s5 +; CHECK-NEXT: s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+8 +; CHECK-NEXT: s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+16 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm +entry: + call void @entry_fn() + ret void +} + +define void @caller() { +; CHECK-LABEL: caller: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_mov_b32 s0, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b32 s1, -1 +; CHECK-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s1 +; CHECK-NEXT: v_writelane_b32 v40, s0, 2 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: s_add_co_i32 s32, s32, 16 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: s_mov_b64 s[0:1], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_sext_i32_i16 s5, s5 +; CHECK-NEXT: s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+12 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+24 +; CHECK-NEXT: v_mov_b32_e32 v0, v31 +; CHECK-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[2:3], s[6:7] +; CHECK-NEXT: s_mov_b64 s[6:7], s[10:11] +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s0, v40, 2 +; CHECK-NEXT: s_or_saveexec_b32 s1, -1 +; CHECK-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s1 +; CHECK-NEXT: s_mov_b32 s33, s0 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + call void @entry_fn() + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll new file mode 100644 index 0000000000000..22fba8c1d5f8c --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll @@ -0,0 +1,59 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for indexed types in dynamically indexed arrays in cbuffers. +; +; struct S { +; float x[2]; +; uint q; +; }; +; cbuffer CB : register(b0) { +; uint32_t3 w[3]; // offset 0, size 12 (+4) * 3 +; S v[3]; // offset 48, size 24 (+8) * 3 +; } +%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }> +%__cblayout_CB = type <{ + <{ + [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], + <3 x i32> + }>, + target("dx.Padding", 4), + <{ + [2 x <{ %S, target("dx.Padding", 8) }>], %S + }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; w[2].z + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: store i32 [[X]], ptr %dst + %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_ptr, i32 40 + %w_load = load i32, ptr addrspace(2) %w_gep, align 4 + store i32 %w_load, ptr %dst, align 4 + + ;; v[2].q + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_ptr, i32 84 + %v_load = load i32, ptr addrspace(2) %v_gep, align 4 + %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %v_load, ptr %v.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll new file mode 100644 index 0000000000000..615fc5ea07eca --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Test for when we have indices into both the array and the vector: ie, s[1][3] + +; cbuffer CB : register(b0) { +; uint4 s[3]; // offset 0, size 16 * 3 +; } +%__cblayout_CB = type <{ [2 x <4 x i32>] }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; s[1][3] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: store i32 [[X]], ptr %dst + %i8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %i8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %i8_ptr, i32 28 + %i8_vecext = load i32, ptr addrspace(2) %i8_gep, align 4 + store i32 %i8_vecext, ptr %dst, align 4 + + ;; s[2].w + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ;; + ;; It would be nice to avoid the redundant vector creation here, but that's + ;; outside of the scope of this pass. + ;; + ; CHECK: [[X_VEC:%.*]] = insertelement <4 x i32> {{%.*}}, i32 [[X]], i32 3 + ; CHECK: [[X_EXT:%.*]] = extractelement <4 x i32> [[X_VEC]], i32 3 + ; CHECK: store i32 [[X_EXT]], ptr %dst + %typed_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %typed_gep = getelementptr <4 x i32>, ptr addrspace(2) %typed_ptr, i32 2 + %typed_load = load <4 x i32>, ptr addrspace(2) %typed_gep, align 16 + %typed_vecext = extractelement <4 x i32> %typed_load, i32 3 + store i32 %typed_vecext, ptr %dst, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll new file mode 100644 index 0000000000000..eabc07c2fbb68 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll @@ -0,0 +1,30 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; +; } +%__cblayout_CB = type <{ [2 x <{ float, [12 x i8] }>], float }> + +@CB.cb = global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h = call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ;; a1[1] + ;; Note that the valid GEPs of a1 are `0, 0, 0`, `0, 0, 1`, and `0, 1`. + ; + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1_gep = getelementptr inbounds <{ [2 x <{ float, [12 x i8] }>], float }>, ptr addrspace(2) %a1_ptr, i32 0, i32 0, i32 1 + %a1 = load float, ptr addrspace(2) %a1_gep, align 4 + store float %a1, ptr %dst, align 32 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll new file mode 100644 index 0000000000000..6f6166e820a6f --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll @@ -0,0 +1,145 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; // offset 0, size 4 (+12) * 3 +; double3 a2[2]; // offset 48, size 24 (+8) * 2 +; float16_t a3[2][2]; // offset 112, size 2 (+14) * 4 +; uint64_t a4[3]; // offset 176, size 8 (+8) * 3 +; int4 a5[2][3][4]; // offset 224, size 16 * 24 +; uint16_t a6[1]; // offset 608, size 2 (+14) * 1 +; int64_t a7[2]; // offset 624, size 8 (+8) * 2 +; bool a8[4]; // offset 656, size 4 (+12) * 4 +; } +%__cblayout_CB = type <{ + <{ [2 x <{ float, target("dx.Padding", 12) }>], float }>, target("dx.Padding", 12), + <{ [1 x <{ <3 x double>, target("dx.Padding", 8) }>], <3 x double> }>, target("dx.Padding", 8), + <{ [3 x <{ half, target("dx.Padding", 14) }>], half }>, target("dx.Padding", 14), + <{ [2 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8), + [24 x <4 x i32>], + [1 x i16], target("dx.Padding", 14), + <{ [1 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8), + <{ [3 x <{ i32, target("dx.Padding", 12) }>], i32 }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; a1[1] + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a1_ptr, i32 16 + %a1 = load float, ptr addrspace(2) %a1_gep, align 4 + store float %a1, ptr %dst, align 32 + + ;; a2[1] + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6) + ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %a2_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a2_ptr, i32 32 + %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 8 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store <3 x double> %a2, ptr %a2.i, align 32 + + ;; a3[0][1] + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8) + ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 32 + ; CHECK: store half [[X]], ptr [[PTR]] + %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 112) + %a3_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a3_ptr, i32 16 + %a3 = load half, ptr addrspace(2) %a3_gep, align 2 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 32 + store half %a3, ptr %a3.i, align 2 + + ;; a4[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 12) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 176) + %a4_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a4_ptr, i32 16 + %a4 = load i64, ptr addrspace(2) %a4_gep, align 8 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store i64 %a4, ptr %a4.i, align 8 + + ;; a5[1][0][0] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 26) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 224) + %a5_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a5_ptr, i32 192 + %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 4 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <4 x i32> %a5, ptr %a5.i, align 4 + + ;; a6[0] + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 38) + ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 64 + ; CHECK: store i16 [[X]], ptr [[PTR]] + %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 608) + %a6 = load i16, ptr addrspace(2) %a6_ptr, align 2 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 64 + store i16 %a6, ptr %a6.i, align 2 + + ;; a7[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 40) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 624) + %a7_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a7_ptr, i32 16 + %a7 = load i64, ptr addrspace(2) %a7_gep, align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 72 + store i64 %a7, ptr %a7.i, align 8 + + ;; a8[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 42) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 80 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %a8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 656) + %a8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a8_ptr, i32 16 + %a8 = load i32, ptr addrspace(2) %a8_gep, align 4, !range !0, !noundef !1 + %a8.i = getelementptr inbounds nuw i8, ptr %dst, i32 80 + store i32 %a8, ptr %a8.i, align 4 + + ret void +} + +!0 = !{i32 0, i32 2} +!1 = !{} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll new file mode 100644 index 0000000000000..22994cfc3f48a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll @@ -0,0 +1,64 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for indexed types in dynamically indexed arrays in cbuffers. +; +; Bug https://github.com/llvm/llvm-project/issues/164517 +; XFAIL: * +; +; struct S { +; float x[2]; +; uint q; +; }; +; cbuffer CB : register(b0) { +; uint32_t3 w[3]; // offset 0, size 12 (+4) * 3 +; S v[3]; // offset 48, size 24 (+8) * 3 +; } +%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }> +%__cblayout_CB = type <{ + <{ + [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], + <3 x i32> + }>, + target("dx.Padding", 4), + <{ + [2 x <{ %S, target("dx.Padding", 8) }>], %S + }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; w[idx].z + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: store i32 [[X]], ptr %dst + %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %w_arrayidx = getelementptr <3 x i32>, ptr addrspace(2) %w_ptr, i32 %idx + %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_arrayidx, i32 4 + %w_load = load i32, ptr addrspace(2) %w_gep, align 4 + store i32 %w_load, ptr %dst, align 4 + + ;; v[idx].q + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %v_arrayidx = getelementptr <{ %struct.S, target("dx.Padding", 4) }>, ptr addrspace(2) %v_ptr, i32 %idx + %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_arrayidx, i32 8 + %v_load = load i32, ptr addrspace(2) %v_gep, align 4 + %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %v_load, ptr %v.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll new file mode 100644 index 0000000000000..7daebaed70442 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll @@ -0,0 +1,46 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for dynamic indices into arrays in cbuffers. + +; cbuffer CB : register(b0) { +; uint s[10]; // offset 0, size 4 (+12) * 10 +; uint t[12]; // offset 160, size 4 (+12) * 12 +; } +%__cblayout_CB = type <{ <{ [9 x <{ i32, target("dx.Padding", 12) }>], i32 }>, target("dx.Padding", 12), <{ [11 x <{ i32, target("dx.Padding", 12) }>], i32 }> }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; s[idx] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: store i32 [[X]], ptr %dst + %s_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %s_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %s_ptr, i32 %idx + %s_load = load i32, ptr addrspace(2) %s_gep, align 4 + store i32 %s_load, ptr %dst, align 4 + + ;; t[idx] + ; + ; CHECK: [[T_IDX:%.*]] = add i32 10, %idx + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 [[T_IDX]]) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %t_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 160) + %t_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %t_ptr, i32 %idx + %t_load = load i32, ptr addrspace(2) %t_gep, align 4 + %t.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %t_load, ptr %t.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll new file mode 100644 index 0000000000000..65c9a3ec966e9 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll @@ -0,0 +1,101 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB { +; float a1; // offset 0, size 4 +; int a2; // offset 4, size 4 +; bool a3; // offset 8, size 4 +; float16_t a4; // offset 12, size 2 +; uint16_t a5; // offset 14, size 2 +; double a6; // offset 16, size 8 +; int64_t a7; // offset 24, size 8 +; } +%__cblayout_CB = type <{ float, i32, i32, half, i16, double, i64 }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + + ;; a1 + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A1:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[A1]], ptr %dst + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1 = load float, ptr addrspace(2) %a1_ptr, align 4 + store float %a1, ptr %dst, align 8 + + ;; a2 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A2:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[A2]], ptr [[PTR]] + %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 4) + %a2 = load i32, ptr addrspace(2) %a2_ptr, align 4 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %a2, ptr %a2.i, align 8 + + ;; a3 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A3:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store i32 [[A3]], ptr [[PTR]] + %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 8) + %a3 = load i32, ptr addrspace(2) %a3_ptr, align 4 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store i32 %a3, ptr %a3.i, align 4 + + ;; a4 + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A4:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 6 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 12 + ; CHECK: store half [[A4]], ptr [[PTR]] + %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 12) + %a4 = load half, ptr addrspace(2) %a4_ptr, align 2 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 12 + store half %a4, ptr %a4.i, align 4 + + ;; a5 + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A5:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 7 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 14 + ; CHECK: store i16 [[A5]], ptr [[PTR]] + %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 14) + %a5 = load i16, ptr addrspace(2) %a5_ptr, align 2 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 14 + store i16 %a5, ptr %a5.i, align 2 + + ;; a6 + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[A6:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store double [[A6]], ptr [[PTR]] + %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16) + %a6 = load double, ptr addrspace(2) %a6_ptr, align 8 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store double %a6, ptr %a6.i, align 8 + + ;; a7 + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[A7:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 24 + ; CHECK: store i64 [[A7]], ptr [[PTR]] + %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 24) + %a7 = load i64, ptr addrspace(2) %a7_ptr, align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 24 + store i64 %a7, ptr %a7.i, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll new file mode 100644 index 0000000000000..0156a1a0472ab --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll @@ -0,0 +1,121 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB { +; float3 a1; // offset 0, size 12 (+4) +; double3 a2; // offset 16, size 24 +; float16_t2 a3; // offset 40, size 4 (+4) +; uint64_t3 a4; // offset 48, size 24 (+8) +; int4 a5; // offset 80, size 16 +; uint16_t3 a6; // offset 96, size 6 +; }; +%__cblayout_CB = type <{ <3 x float>, target("dx.Padding", 4), <3 x double>, <2 x half>, target("dx.Padding", 4), <3 x i64>, target("dx.Padding", 8), <4 x i32>, <3 x i16> }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + + ;; a1 + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 2 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x float> poison, float [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x float> [[VEC0]], float [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x float> [[VEC1]], float [[Z]], i32 2 + ; CHECK: store <3 x float> [[VEC2]], ptr %dst + %a1_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1 = load <3 x float>, ptr addrspace(2) %a1_gep, align 16 + store <3 x float> %a1, ptr %dst, align 4 + + ;; a2 + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16) + %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 32 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store <3 x double> %a2, ptr %a2.i, align 8 + + ;; a3 + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 4 + ; CHECK: [[Y:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 5 + ; CHECK: [[VEC0:%.*]] = insertelement <2 x half> poison, half [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <2 x half> [[VEC0]], half [[Y]], i32 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store <2 x half> [[VEC1]], ptr [[PTR]] + %a3_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 40) + %a3 = load <2 x half>, ptr addrspace(2) %a3_gep, align 4 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store <2 x half> %a3, ptr %a3.i, align 2 + + ;; a4 + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 3) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 4) + ; CHECK: [[Z:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x i64> poison, i64 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x i64> [[VEC0]], i64 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x i64> [[VEC1]], i64 [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <3 x i64> [[VEC2]], ptr [[PTR]] + %a4_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %a4 = load <3 x i64>, ptr addrspace(2) %a4_gep, align 32 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <3 x i64> %a4, ptr %a4.i, align 8 + + ;; a5 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 80) + %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 16 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 72 + store <4 x i32> %a5, ptr %a5.i, align 4 + + ;; a6 + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6) + ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 2 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x i16> poison, i16 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x i16> [[VEC0]], i16 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x i16> [[VEC1]], i16 [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 88 + ; CHECK: store <3 x i16> [[VEC2]], ptr [[PTR]] + %a6_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 96) + %a6 = load <3 x i16>, ptr addrspace(2) %a6_gep, align 8 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 88 + store <3 x i16> %a6, ptr %a6.i, align 2 + + ret void +} diff --git a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll index 620b2acc49520..1c1965d44541f 100644 --- a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll +++ b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=hexagon -relocation-model=pic < %s | FileCheck %s -; RUN: llc -mtriple=hexagon < %s | FileCheck %s +; RUN: llc -mtriple=hexagon --mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp < %s | FileCheck %s ; CHECK-LABEL: test1: ; CHECK: {{call my_instrprof_handler|r0 = #999}} @@ -14,7 +14,4 @@ entry: } ; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn -declare void @llvm.hexagon.instrprof.custom(ptr, i32) #1 - -attributes #0 = { "target-features"="+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp,+hmxv68" } -attributes #1 = { inaccessiblememonly nofree nosync nounwind willreturn } +declare void @llvm.hexagon.instrprof.custom(ptr, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll index 79665af17ef58..9632469261f4d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll @@ -7,22 +7,22 @@ define dso_local i32 @test_500_504(ptr nocapture readonly %x) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #126 -; CHECK-NEXT: adr r2, .LCPI0_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: mov.w r2, #500 -; CHECK-NEXT: vdup.32 q1, r2 -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: adr r1, .LCPI0_0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: mov.w r1, #500 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vdup.32 q1, r1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vqadd.u32 q2, q0, r1 -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: vqadd.u32 q2, q0, r2 +; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: vptt.u32 hi, q1, q2 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vaddvat.u32 r2, q2 +; CHECK-NEXT: vaddvat.u32 r12, q2 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll index ec257bcf123f3..bcedcd40ba112 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll @@ -28,29 +28,29 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK-NEXT: str r6, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: .LBB0_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r11, [r0, #16]! -; CHECK-NEXT: ldrd r5, r7, [r0, #-12] +; CHECK-NEXT: ldrd r5, r6, [r0, #-12] ; CHECK-NEXT: ldr r4, [r0, #-4] ; CHECK-NEXT: cmp r12, r5 ; CHECK-NEXT: csel r5, r5, r12, gt -; CHECK-NEXT: csinc r6, r10, r8, le -; CHECK-NEXT: cmp r5, r7 +; CHECK-NEXT: csinc r7, r10, r8, le +; CHECK-NEXT: cmp r5, r6 ; CHECK-NEXT: it gt -; CHECK-NEXT: addgt.w r6, r8, #2 -; CHECK-NEXT: csel r7, r7, r5, gt -; CHECK-NEXT: cmp r7, r4 +; CHECK-NEXT: addgt.w r7, r8, #2 +; CHECK-NEXT: csel r6, r6, r5, gt +; CHECK-NEXT: cmp r6, r4 ; CHECK-NEXT: it gt -; CHECK-NEXT: addgt.w r6, r8, #3 -; CHECK-NEXT: csel r7, r4, r7, gt +; CHECK-NEXT: addgt.w r7, r8, #3 +; CHECK-NEXT: csel r6, r4, r6, gt ; CHECK-NEXT: add.w r8, r8, #4 -; CHECK-NEXT: cmp r7, r11 -; CHECK-NEXT: csel r10, r8, r6, gt -; CHECK-NEXT: csel r12, r11, r7, gt +; CHECK-NEXT: cmp r6, r11 +; CHECK-NEXT: csel r10, r8, r7, gt +; CHECK-NEXT: csel r12, r11, r6, gt ; CHECK-NEXT: le lr, .LBB0_5 ; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit ; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 1769c5d2fd385..98e082be4cad1 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -21,11 +21,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: it lt ; ENABLED-NEXT: bxlt lr ; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ENABLED-NEXT: mov r11, r0 -; ENABLED-NEXT: ldr r0, [sp, #32] +; ENABLED-NEXT: ldr r0, [sp, #36] ; ENABLED-NEXT: add.w r9, r2, #3 ; ENABLED-NEXT: mov.w r12, #0 +; ENABLED-NEXT: mov.w r8, #1 ; ENABLED-NEXT: mov r10, r11 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 @@ -49,18 +50,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 -; ENABLED-NEXT: movs r7, #1 -; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: sub.w r4, r2, r12 +; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: vmov.i32 q1, #0x0 -; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 +; ENABLED-NEXT: mov r7, r10 +; ENABLED-NEXT: add.w r6, r8, r0, lsr #2 ; ENABLED-NEXT: adds r0, r2, #3 ; ENABLED-NEXT: sub.w r0, r0, r12 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 -; ENABLED-NEXT: mov r7, r10 -; ENABLED-NEXT: dls lr, r0 +; ENABLED-NEXT: add.w lr, r8, r0, lsr #2 ; ENABLED-NEXT: mov r0, r11 ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -83,7 +82,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ENABLED-NEXT: bx lr ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: @@ -92,11 +91,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: it lt ; NOREDUCTIONS-NEXT: bxlt lr ; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: mov r11, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp, #32] +; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] ; NOREDUCTIONS-NEXT: add.w r9, r2, #3 ; NOREDUCTIONS-NEXT: mov.w r12, #0 +; NOREDUCTIONS-NEXT: mov.w r8, #1 ; NOREDUCTIONS-NEXT: mov r10, r11 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 @@ -120,18 +120,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 -; NOREDUCTIONS-NEXT: movs r7, #1 -; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 +; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 -; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: mov r7, r10 +; NOREDUCTIONS-NEXT: add.w r6, r8, r0, lsr #2 ; NOREDUCTIONS-NEXT: adds r0, r2, #3 ; NOREDUCTIONS-NEXT: sub.w r0, r0, r12 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 -; NOREDUCTIONS-NEXT: mov r7, r10 -; NOREDUCTIONS-NEXT: dls lr, r0 +; NOREDUCTIONS-NEXT: add.w lr, r8, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r0, r11 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -154,7 +152,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: bx lr entry: %conv = sext i16 %N to i32 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll index cbcbf1f392ce8..435acc29f076e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -165,74 +165,73 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" { ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: wls lr, r1, .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: adds r6, r3, #4 -; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: add.w r9, r3, #4 +; CHECK-NEXT: add.w r10, r0, #4 ; CHECK-NEXT: mvn r8, #1 -; CHECK-NEXT: @ implicit-def: $r9 +; CHECK-NEXT: @ implicit-def: $r6 ; CHECK-NEXT: @ implicit-def: $r4 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: ldr.w r1, [r10] ; CHECK-NEXT: asrs r2, r4, #31 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: muls r1, r3, r1 ; CHECK-NEXT: adds r4, r4, r1 ; CHECK-NEXT: adc.w r1, r2, r1, asr #31 ; CHECK-NEXT: adds.w r2, r4, #-2147483648 -; CHECK-NEXT: ldrd r2, r4, [r8] -; CHECK-NEXT: adc r5, r1, #0 -; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: smull r4, r2, r4, r9 -; CHECK-NEXT: asrs r1, r5, #31 +; CHECK-NEXT: ldrd r5, r4, [r8] +; CHECK-NEXT: adc r2, r1, #0 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: sbcs r1, r2 -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds.w r10, r4, #-2147483648 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: ldr r4, [r2, #-4] +; CHECK-NEXT: smull r4, r5, r4, r6 +; CHECK-NEXT: asrs r1, r2, #31 +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: subs r4, r2, r4 +; CHECK-NEXT: sbcs r1, r5 +; CHECK-NEXT: adds.w r6, r4, #-2147483648 +; CHECK-NEXT: ldr r4, [r10, #-4] +; CHECK-NEXT: adc r11, r1, #0 +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: add.w r10, r10, #4 ; CHECK-NEXT: muls r4, r3, r4 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: adds.w r12, r4, #-2147483648 ; CHECK-NEXT: asr.w r5, r4, #31 -; CHECK-NEXT: ldr r4, [r6] +; CHECK-NEXT: ldr.w r4, [r9] ; CHECK-NEXT: adc r5, r5, #0 ; CHECK-NEXT: mul r2, r4, r0 -; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: add.w r2, r2, #-2147483648 ; CHECK-NEXT: asrl r12, r5, r2 -; CHECK-NEXT: smull r2, r5, r4, r12 -; CHECK-NEXT: lsll r2, r5, #30 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: asr.w r11, r5, #31 -; CHECK-NEXT: mov r12, r5 -; CHECK-NEXT: lsll r12, r11, r4 -; CHECK-NEXT: mul r2, r2, r9 -; CHECK-NEXT: lsrl r12, r11, #2 -; CHECK-NEXT: adds r2, #2 -; CHECK-NEXT: lsll r12, r11, r2 +; CHECK-NEXT: smull r2, r9, r4, r12 +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: lsll r2, r9, #30 +; CHECK-NEXT: asr.w r5, r9, #31 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: lsll r2, r5, r4 +; CHECK-NEXT: lsrl r2, r5, #2 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: lsll r2, r5, r0 +; CHECK-NEXT: add.w r0, r2, #-2147483648 ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r12, #-2147483648 -; CHECK-NEXT: asrl r10, r1, r5 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: lsrl r10, r1, #2 -; CHECK-NEXT: movs r1, #2 -; CHECK-NEXT: mov r9, r10 -; CHECK-NEXT: str.w r10, [r1] -; CHECK-NEXT: ldr r1, [r8], #-4 -; CHECK-NEXT: mls r5, r1, r4, r5 -; CHECK-NEXT: adds.w r4, r5, #-2147483648 -; CHECK-NEXT: asr.w r1, r5, #31 +; CHECK-NEXT: asrl r6, r11, r0 +; CHECK-NEXT: movs r0, #2 +; CHECK-NEXT: lsrl r6, r11, #2 +; CHECK-NEXT: str r6, [r0] +; CHECK-NEXT: ldr r0, [r8], #-4 +; CHECK-NEXT: mls r0, r0, r4, r1 +; CHECK-NEXT: adds.w r4, r0, #-2147483648 +; CHECK-NEXT: asr.w r1, r0, #31 ; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: lsrl r4, r1, #2 -; CHECK-NEXT: rsbs r1, r4, #0 -; CHECK-NEXT: str r1, [r2] -; CHECK-NEXT: str r1, [r6, #-4] -; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: rsbs r0, r4, #0 +; CHECK-NEXT: str r0, [r2] +; CHECK-NEXT: str r0, [r9, #-4] +; CHECK-NEXT: add.w r9, r9, #4 +; CHECK-NEXT: add.w r0, r12, #4 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: .LBB2_3: @ %while.end ; CHECK-NEXT: add sp, #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index f7b4548f127bf..b6657d607ce6d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1573,120 +1573,115 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly % ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: ldrd r7, r9, [r0] -; CHECK-NEXT: and r6, r3, #3 -; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: lsrs r3, r3, #2 -; CHECK-NEXT: @ implicit-def: $r12 -; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: ldm.w r0, {r7, r9, r11} +; CHECK-NEXT: and r0, r3, #3 +; CHECK-NEXT: @ implicit-def: $r5 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: lsrs r0, r3, #2 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: b .LBB19_3 ; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r12, r10 ; CHECK-NEXT: .LBB19_2: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r0, #128 -; CHECK-NEXT: strd r2, r4, [r9] -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: subs r7, #1 -; CHECK-NEXT: strd r3, r8, [r9, #8] -; CHECK-NEXT: add.w r9, r9, #16 +; CHECK-NEXT: add.w r11, r11, #128 +; CHECK-NEXT: strd r8, r0, [r9] ; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: strd r3, r12, [r9, #8] +; CHECK-NEXT: add.w r9, r9, #16 +; CHECK-NEXT: subs r7, #1 ; CHECK-NEXT: beq.w .LBB19_13 ; CHECK-NEXT: .LBB19_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB19_5 Depth 2 -; CHECK-NEXT: ldrd r5, r11, [r9] +; CHECK-NEXT: ldr.w r10, [r9, #12] ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: ldrd r8, r10, [r9, #8] -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: ldm.w r9, {r3, r4, r12} +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: wls lr, r2, .LBB19_6 +; CHECK-NEXT: wls lr, r0, .LBB19_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: .LBB19_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr r5, [r1, #12] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: ldm.w r1, {r2, r7, r11} -; CHECK-NEXT: vmul.f32 q2, q2, r5 -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vfma.f32 q2, q6, r11 -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r8, r4 +; CHECK-NEXT: ldrd r4, r3, [r1, #8] +; CHECK-NEXT: vldrw.u32 q2, [r11] +; CHECK-NEXT: vldrw.u32 q6, [r11, #16] +; CHECK-NEXT: ldrd r0, r7, [r1] +; CHECK-NEXT: vmul.f32 q2, q2, r3 +; CHECK-NEXT: vldrw.u32 q7, [r11, #32] +; CHECK-NEXT: vfma.f32 q2, q6, r4 +; CHECK-NEXT: vldrw.u32 q4, [r11, #48] ; CHECK-NEXT: vfma.f32 q2, q7, r7 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vfma.f32 q2, q4, r2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vfma.f32 q2, q5, r3 -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] -; CHECK-NEXT: vfma.f32 q2, q3, r4 -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vfma.f32 q2, q1, r8 +; CHECK-NEXT: vldrw.u32 q5, [r11, #64] +; CHECK-NEXT: vfma.f32 q2, q4, r0 +; CHECK-NEXT: vldrw.u32 q3, [r11, #80] +; CHECK-NEXT: vfma.f32 q2, q5, r5 +; CHECK-NEXT: vldrw.u32 q1, [r11, #96] +; CHECK-NEXT: vfma.f32 q2, q3, r8 +; CHECK-NEXT: vldrw.u32 q0, [r11, #112] +; CHECK-NEXT: vfma.f32 q2, q1, r12 ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vfma.f32 q2, q0, r10 -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: vmov r10, r8, d5 +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: vmov r10, r12, d5 ; CHECK-NEXT: vstrb.8 q2, [r6], #16 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: mov r12, r5 ; CHECK-NEXT: le lr, .LBB19_5 ; CHECK-NEXT: .LBB19_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: beq .LBB19_1 ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldrd lr, r4, [r1] -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldrd r2, r1, [r1, #8] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: ldrd lr, r0, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r11] +; CHECK-NEXT: ldrd r8, r1, [r1, #8] +; CHECK-NEXT: vldrw.u32 q6, [r11, #16] +; CHECK-NEXT: vldrw.u32 q7, [r11, #32] +; CHECK-NEXT: vldrw.u32 q4, [r11, #48] ; CHECK-NEXT: vmul.f32 q0, q0, r1 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vfma.f32 q0, q6, r2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vfma.f32 q0, q7, r4 -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] +; CHECK-NEXT: vldrw.u32 q5, [r11, #64] +; CHECK-NEXT: vfma.f32 q0, q6, r8 +; CHECK-NEXT: vldrw.u32 q3, [r11, #80] +; CHECK-NEXT: vfma.f32 q0, q7, r0 +; CHECK-NEXT: vldrw.u32 q2, [r11, #96] ; CHECK-NEXT: vfma.f32 q0, q4, lr -; CHECK-NEXT: vldrw.u32 q1, [r0, #112] -; CHECK-NEXT: vfma.f32 q0, q5, r5 -; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: vfma.f32 q0, q3, r11 -; CHECK-NEXT: vfma.f32 q0, q2, r8 +; CHECK-NEXT: vldrw.u32 q1, [r11, #112] +; CHECK-NEXT: vfma.f32 q0, q5, r3 +; CHECK-NEXT: cmp r7, #1 +; CHECK-NEXT: vfma.f32 q0, q3, r4 +; CHECK-NEXT: vfma.f32 q0, q2, r12 ; CHECK-NEXT: vfma.f32 q0, q1, r10 -; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: bne .LBB19_9 ; CHECK-NEXT: @ %bb.8: @ %if.then58 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: str r5, [r6] -; CHECK-NEXT: mov r2, lr -; CHECK-NEXT: mov r4, r12 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: str r4, [r6] +; CHECK-NEXT: mov r8, lr +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: .LBB19_9: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vmov r8, s1 -; CHECK-NEXT: cmp r3, #2 +; CHECK-NEXT: vmov r12, s1 +; CHECK-NEXT: cmp r7, #2 ; CHECK-NEXT: vstr s1, [r6, #4] -; CHECK-NEXT: str r5, [r6] +; CHECK-NEXT: str r4, [r6] ; CHECK-NEXT: bne .LBB19_11 ; CHECK-NEXT: @ %bb.10: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r4, lr -; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r0, lr +; CHECK-NEXT: mov r12, r4 ; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: .LBB19_11: @ %if.else64 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 @@ -1694,7 +1689,7 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly % ; CHECK-NEXT: vstr s2, [r6, #8] ; CHECK-NEXT: .LBB19_12: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: b .LBB19_2 ; CHECK-NEXT: .LBB19_13: @ %do.end ; CHECK-NEXT: add sp, #16 @@ -1901,8 +1896,8 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: ldrd r6, r12, [r0, #4] ; CHECK-NEXT: lsr.w r8, r3, #1 ; CHECK-NEXT: ldrb r0, [r0] @@ -1910,11 +1905,11 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: b .LBB20_3 ; CHECK-NEXT: .LBB20_1: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vstr s12, [r6] +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vstr s4, [r6] ; CHECK-NEXT: .LBB20_2: @ %if.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vstr s14, [r6, #4] +; CHECK-NEXT: vstr s6, [r6, #4] ; CHECK-NEXT: add.w r12, r12, #20 ; CHECK-NEXT: adds r6, #8 ; CHECK-NEXT: subs r0, #1 @@ -1923,41 +1918,39 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: .LBB20_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB20_5 Depth 2 -; CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: vldrw.u32 q3, [r12] ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vshlc q4, r5, #32 -; CHECK-NEXT: vldrw.u32 q1, [r12, #8] -; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vldrw.u32 q2, [r12, #8] +; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vshlc q5, r5, #32 -; CHECK-NEXT: vldrw.u32 q3, [r6] -; CHECK-NEXT: vmov.f32 s14, s0 +; CHECK-NEXT: vldrw.u32 q1, [r6] +; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: vmov.f32 s15, s0 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: wls lr, r8, .LBB20_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: .LBB20_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB20_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrd r7, r4, [r1], #8 -; CHECK-NEXT: vfma.f32 q6, q2, r7 -; CHECK-NEXT: vmov r7, s24 -; CHECK-NEXT: vmov q3, q6 -; CHECK-NEXT: vfma.f32 q3, q1, r7 -; CHECK-NEXT: vstr s24, [r5] -; CHECK-NEXT: vmov.f32 s15, s0 -; CHECK-NEXT: vfma.f32 q3, q4, r4 -; CHECK-NEXT: vmov r4, s13 -; CHECK-NEXT: vstr s13, [r5, #4] -; CHECK-NEXT: vfma.f32 q3, q5, r4 +; CHECK-NEXT: vfma.f32 q1, q3, r7 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vfma.f32 q1, q2, r7 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vfma.f32 q1, q4, r4 +; CHECK-NEXT: vmov r4, s5 +; CHECK-NEXT: vstr s5, [r5, #4] +; CHECK-NEXT: vfma.f32 q1, q5, r4 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vstr s2, [r5] ; CHECK-NEXT: adds r5, #8 -; CHECK-NEXT: vmov.f32 s12, s14 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s14, s0 -; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: le lr, .LBB20_5 ; CHECK-NEXT: .LBB20_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 @@ -1966,14 +1959,14 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 ; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vfma.f32 q3, q2, r1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vstr s12, [r5] -; CHECK-NEXT: vfma.f32 q3, q1, r1 -; CHECK-NEXT: vstr s13, [r6] +; CHECK-NEXT: vfma.f32 q1, q3, r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vstr s4, [r5] +; CHECK-NEXT: vfma.f32 q1, q2, r1 +; CHECK-NEXT: vstr s5, [r6] ; CHECK-NEXT: b .LBB20_2 ; CHECK-NEXT: .LBB20_8: @ %do.end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.9: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index 0d86f22a321e0..b60ee7c6d406b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -1313,27 +1313,29 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 ; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: .LBB16_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q1, q5, r0 +; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vadd.i32 q6, q5, r0 +; CHECK-NEXT: vmov r7, r3, d13 ; CHECK-NEXT: vadd.i32 q2, q4, r0 -; CHECK-NEXT: vmov r7, r3, d3 -; CHECK-NEXT: vadd.i32 q6, q0, lr ; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vmov q1, q7 +; CHECK-NEXT: vmov r4, r10, d12 +; CHECK-NEXT: vadd.i32 q6, q0, lr ; CHECK-NEXT: subs.w r9, r9, #16 -; CHECK-NEXT: vmov r4, r10, d2 -; CHECK-NEXT: vadd.i32 q1, q7, lr ; CHECK-NEXT: vadd.i32 q4, q4, lr ; CHECK-NEXT: vadd.i32 q5, q5, lr +; CHECK-NEXT: vadd.i32 q7, q7, lr ; CHECK-NEXT: ldrb.w r11, [r3] ; CHECK-NEXT: ldrb r3, [r7] ; CHECK-NEXT: vmov r7, r12, d4 -; CHECK-NEXT: vadd.i32 q2, q7, r0 -; CHECK-NEXT: vadd.i32 q7, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: ldrb r4, [r4] @@ -1342,7 +1344,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: ldrb.w r1, [r12] ; CHECK-NEXT: vmov.8 q0[0], r7 ; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: vmov r1, r7, d15 +; CHECK-NEXT: vmov r1, r7, d3 ; CHECK-NEXT: vmov.8 q0[2], r5 ; CHECK-NEXT: vmov.8 q0[3], r6 ; CHECK-NEXT: vmov.8 q0[4], r4 @@ -1357,8 +1359,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: ldrb.w r12, [r7] ; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: vmov r4, r7, d14 -; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vmov r4, r7, d2 ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[8], r4 @@ -1370,7 +1371,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: vmov.8 q0[14], r3 ; CHECK-NEXT: vmov.8 q0[15], r12 ; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: bne .LBB16_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index eedca2cd4a5d3..c0b2da7eff41b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -236,11 +236,11 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(ptr noalias nocapture r ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2] -; CHECK-NEXT: vadd.i32 q3, q1, q0 +; CHECK-NEXT: vldrw.u32 q3, [r0, q1, uxtw #2] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] -; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vstrw.32 q3, [r0, q2, uxtw #2] ; CHECK-NEXT: bne .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr @@ -330,20 +330,20 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(ptr noalias nocapture readonly %da ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r4, .LCPI7_0 ; CHECK-NEXT: mov.w r12, #9 -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vdup.32 q1, r0 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vadd.i32 q2, q1, r4 -; CHECK-NEXT: vmla.i32 q3, q1, lr -; CHECK-NEXT: vmul.i32 q1, q1, r12 -; CHECK-NEXT: vldrw.u32 q4, [q3, #24] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmla.i32 q3, q2, lr ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vstrw.32 q1, [r3] -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vldrw.u32 q4, [q3, #24] +; CHECK-NEXT: vmul.i32 q2, q2, r12 +; CHECK-NEXT: vadd.i32 q0, q0, r4 +; CHECK-NEXT: vstrw.32 q2, [r3] ; CHECK-NEXT: vstrb.8 q4, [r1], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -390,22 +390,22 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(ptr noalias nocapture readonly %da ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr r4, .LCPI8_0 ; CHECK-NEXT: movs r5, #18 -; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: mov.w r12, #9 ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vdup.32 q1, r5 +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vdup.32 q2, r5 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vadd.i32 q3, q2, r4 -; CHECK-NEXT: vmla.i32 q4, q2, lr +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmla.i32 q4, q3, lr ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldrw.u32 q5, [q4, #24] -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmla.i32 q4, q2, r12 -; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmla.i32 q4, q3, r12 +; CHECK-NEXT: vadd.i32 q0, q0, r4 ; CHECK-NEXT: vstrb.8 q5, [r1], #16 ; CHECK-NEXT: vstrw.32 q4, [r3] ; CHECK-NEXT: bne .LBB8_1 @@ -487,21 +487,21 @@ define dso_local void @arm_mat_mult_q31(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov q7, q2 +; CHECK-NEXT: vmov q1, q2 ; CHECK-NEXT: dls lr, r10 ; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: vmlas.i32 q7, q0, r7 -; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vmlas.i32 q1, q0, r7 +; CHECK-NEXT: vmov q7, q4 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q0, q7, q3 -; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2] -; CHECK-NEXT: vldrw.u32 q7, [q6, #32]! -; CHECK-NEXT: vmul.i32 q1, q1, q7 -; CHECK-NEXT: vmov q7, q0 -; CHECK-NEXT: vadd.i32 q5, q1, q5 +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q3 +; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2] +; CHECK-NEXT: vldrw.u32 q6, [q7, #32]! +; CHECK-NEXT: vmul.i32 q0, q0, q6 +; CHECK-NEXT: vadd.i32 q5, q0, q5 ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2 @@ -702,12 +702,12 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q6, q5, q3 -; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] -; CHECK-NEXT: vldrh.s32 q5, [r3], #8 -; CHECK-NEXT: vmul.i32 q5, q7, q5 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov q5, q6 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vadd.i32 q5, q5, q3 +; CHECK-NEXT: vldrh.s32 q7, [r1, q6, uxtw #1] +; CHECK-NEXT: vldrh.s32 q6, [r3], #8 +; CHECK-NEXT: vmul.i32 q6, q7, q6 +; CHECK-NEXT: vadd.i32 q4, q6, q4 ; CHECK-NEXT: le lr, .LBB10_11 ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 @@ -922,15 +922,15 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(ptr nocapture readonly ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 ; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=5 -; CHECK-NEXT: vldrb.s32 q2, [r0, q5] -; CHECK-NEXT: vadd.i32 q7, q5, q0 -; CHECK-NEXT: vldrb.s32 q5, [r1, q4] -; CHECK-NEXT: vadd.i32 q6, q4, q0 -; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vmov q7, q5 +; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vldrb.s32 q2, [r0, q7] +; CHECK-NEXT: vldrb.s32 q7, [r1, q6] ; CHECK-NEXT: subs r5, #4 -; CHECK-NEXT: vmlava.u32 r12, q2, q5 -; CHECK-NEXT: vmov q5, q7 -; CHECK-NEXT: vmov q4, q6 +; CHECK-NEXT: vadd.i32 q4, q4, q0 +; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vadd.i32 q5, q5, q0 +; CHECK-NEXT: vmlava.u32 r12, q2, q7 ; CHECK-NEXT: bne .LBB11_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4 diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll index 43ed5eefbf4c7..d6c5cde30ed73 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll @@ -18,50 +18,50 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: csel r7, r6, r5, hs ; CHECK-NEXT: add.w lr, r7, #1 ; CHECK-NEXT: mov r4, r5 -; CHECK-NEXT: vldrh.u16 q0, [r0], #32 +; CHECK-NEXT: vldrh.u16 q1, [r0], #32 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: vldrh.u16 q2, [r1], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q2 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q2 +; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 ; CHECK-NEXT: vldrh.u16 q1, [r1], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 ; CHECK-NEXT: sub.w lr, lr, #1 ; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 +; CHECK-NEXT: vldrh.u16 q3, [r0], #32 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 +; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 +; CHECK-NEXT: vldrh.u16 q3, [r0], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 +; CHECK-NEXT: vldrh.u16 q1, [r1], #32 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 ; CHECK-NEXT: movs r6, #14 ; CHECK-NEXT: and.w r2, r6, r2, lsl #1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r1, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 +; CHECK-NEXT: vldrh.u16 q1, [r1, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrht.u16 q1, [r0] +; CHECK-NEXT: vldrht.u16 q2, [r0] ; CHECK-NEXT: cmp r2, #9 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrht.u16 q0, [r1] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0 -; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0 +; CHECK-NEXT: vmlsldavat.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q2, q0 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.4: @ %do.body.1 ; CHECK-NEXT: subs r2, #8 diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index 94d5490cead2f..6f2a0b2debc47 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -439,17 +439,18 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) { ; CHECK-NEXT: vmovx.f16 s1, s14 ; CHECK-NEXT: vmovx.f16 s20, s0 ; CHECK-NEXT: vins.f16 s23, s1 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vins.f16 s20, s1 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vmovx.f16 s21, s4 -; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vins.f16 s20, s2 +; CHECK-NEXT: vmovx.f16 s2, s6 ; CHECK-NEXT: vins.f16 s12, s14 ; CHECK-NEXT: vins.f16 s8, s10 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vins.f16 s21, s1 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vins.f16 s21, s2 +; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vmov.f32 s3, s12 ; CHECK-NEXT: vadd.i16 q0, q0, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll index ab41069bfa258..ecb169898f9f0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -391,17 +391,18 @@ define void @vld4_v8i16_align1(ptr %src, ptr %dst) { ; CHECK-NEXT: vmovx.f16 s1, s2 ; CHECK-NEXT: vmovx.f16 s20, s8 ; CHECK-NEXT: vins.f16 s23, s1 -; CHECK-NEXT: vmovx.f16 s1, s10 -; CHECK-NEXT: vins.f16 s20, s1 +; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmovx.f16 s10, s10 ; CHECK-NEXT: vmovx.f16 s21, s12 -; CHECK-NEXT: vmovx.f16 s1, s14 +; CHECK-NEXT: vins.f16 s20, s10 +; CHECK-NEXT: vmovx.f16 s10, s14 ; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vins.f16 s12, s14 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vins.f16 s21, s1 -; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vins.f16 s21, s10 ; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vins.f16 s8, s1 +; CHECK-NEXT: vmov.f32 s9, s12 ; CHECK-NEXT: vmov.f32 s11, s0 ; CHECK-NEXT: vadd.i16 q0, q2, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll index 04be18e3dd873..6656d44eec81e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll @@ -344,14 +344,14 @@ define void @loop_absmax32_pred_c(ptr %0, i32 %1, ptr nocapture %2) { ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB19_1: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmaxnma.f32 q1, q0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vmaxnma.f32 q0, q1 ; CHECK-NEXT: letp lr, .LBB19_1 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: vldr s0, .LCPI19_0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmaxnmav.f32 r0, q1 +; CHECK-NEXT: vldr s4, .LCPI19_0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmaxnmav.f32 r0, q0 ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vstr s0, [r2] ; CHECK-NEXT: pop {r7, pc} @@ -538,14 +538,14 @@ define void @loop_absmax16_pred_c(ptr %0, i32 %1, ptr nocapture %2) { ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.16 lr, r1 ; CHECK-NEXT: .LBB23_1: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0], #8 -; CHECK-NEXT: vmaxnma.f16 q1, q0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0], #8 +; CHECK-NEXT: vmaxnma.f16 q0, q1 ; CHECK-NEXT: letp lr, .LBB23_1 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: vldr.16 s0, .LCPI23_0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmaxnmav.f16 r0, q1 +; CHECK-NEXT: vldr.16 s4, .LCPI23_0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmaxnmav.f16 r0, q0 ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index 26ab555c2c593..fb5f543fd0d3a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -1055,18 +1055,18 @@ define void @vst4_v4f16(ptr %src, ptr %dst) { ; CHECK-NEXT: vins.f16 s12, s2 ; CHECK-NEXT: vmovx.f16 s2, s3 ; CHECK-NEXT: vins.f16 s11, s2 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s6, s7 +; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vmovx.f16 s2, s7 ; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vins.f16 s10, s6 +; CHECK-NEXT: vins.f16 s10, s2 ; CHECK-NEXT: vmov.f32 s9, s1 ; CHECK-NEXT: vmov.f32 s5, s0 ; CHECK-NEXT: vstrh.16 q2, [r1, #16] -; CHECK-NEXT: vmov.f32 s6, s2 ; CHECK-NEXT: vmov.f32 s7, s12 ; CHECK-NEXT: vstrh.16 q1, [r1] ; CHECK-NEXT: pop {r4, r5, r6, pc} diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll index e6fcf56af6e8d..2929a04cc0637 100644 --- a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll +++ b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll @@ -63,8 +63,8 @@ define hidden i32 @f(i32 %n) local_unnamed_addr #0 { ; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: sub.w r3, r4, #16 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r5, [r3, #16]! diff --git a/llvm/test/CodeGen/X86/3addr-16bit.ll b/llvm/test/CodeGen/X86/3addr-16bit.ll index c9390d91d59c2..2b692bff0461e 100644 --- a/llvm/test/CodeGen/X86/3addr-16bit.ll +++ b/llvm/test/CodeGen/X86/3addr-16bit.ll @@ -10,27 +10,27 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test1: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: incl %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: incl %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB0_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB0_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test1: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: incl %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB0_2 @@ -63,27 +63,27 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test2: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: decl %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: decl %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB1_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB1_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test2: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: decl %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB1_2 @@ -118,27 +118,27 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test3: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: addl $2, %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: addl $2, %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB2_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB2_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test3: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl $2, %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB2_2 @@ -171,19 +171,19 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test4: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: addl %edi, %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB3_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB3_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test4: @@ -191,8 +191,8 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: cmpw %cx, %dx ; X86-NEXT: jne LBB3_2 diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll index b4d40fee01e41..71887e369bd18 100644 --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll @@ -2156,15 +2156,17 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no ; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_brz: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %ecx, %edx ; X64-NEXT: andb $15, %cl -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx +; X64-NEXT: movl $1, %esi +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shll %cl, %esi ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: .p2align 4 ; X64-NEXT: .LBB34_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: xorl %edx, %ecx +; X64-NEXT: xorl %esi, %ecx ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: lock cmpxchgw %cx, (%rdi) ; X64-NEXT: # kill: def $ax killed $ax def $eax @@ -2172,12 +2174,12 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no ; X64-NEXT: # %bb.2: # %atomicrmw.end ; X64-NEXT: movzwl %ax, %ecx ; X64-NEXT: movw $123, %ax -; X64-NEXT: testl %ecx, %edx +; X64-NEXT: testl %ecx, %esi ; X64-NEXT: je .LBB34_3 ; X64-NEXT: # %bb.4: # %return ; X64-NEXT: retq ; X64-NEXT: .LBB34_3: # %if.then -; X64-NEXT: movzwl %si, %eax +; X64-NEXT: movzwl %dx, %eax ; X64-NEXT: movzwl (%rdi,%rax,2), %eax ; X64-NEXT: retq entry: @@ -3398,10 +3400,12 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n ; X64-LABEL: atomic_shl1_mask01_and_16_gpr_brnz: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %ecx, %edx ; X64-NEXT: andb $15, %cl -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx +; X64-NEXT: movl $1, %esi +; X64-NEXT: shll %cl, %esi ; X64-NEXT: movl $-2, %r8d +; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: roll %cl, %r8d ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: .p2align 4 @@ -3415,10 +3419,10 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n ; X64-NEXT: jne .LBB52_1 ; X64-NEXT: # %bb.2: # %atomicrmw.end ; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: testl %eax, %edx +; X64-NEXT: testl %eax, %esi ; X64-NEXT: je .LBB52_3 ; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movzwl %si, %eax +; X64-NEXT: movzwl %dx, %eax ; X64-NEXT: movzwl (%rdi,%rax,2), %eax ; X64-NEXT: retq ; X64-NEXT: .LBB52_3: diff --git a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll index 105ee7f82ee79..e118f5dbc1534 100644 --- a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll +++ b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll @@ -46,8 +46,9 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x ; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: lock cmpxchgl %ecx, (%rbx) ; CHECK-NEXT: setne %cl -; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movl %eax, %edx ; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: pinsrw $0, %edx, %xmm0 ; CHECK-NEXT: pinsrw $0, %eax, %xmm1 ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: jne .LBB0_1 diff --git a/llvm/test/CodeGen/X86/basic-block-sections-list.ll b/llvm/test/CodeGen/X86/basic-block-sections-list.ll index 45ef452f4f5c1..d652a540f3e9c 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-list.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-list.ll @@ -1,17 +1,13 @@ -;; Check the basic block sections list option. -;; version 0 profile: -; RUN: echo '!_Z3foob' > %t1 +;; Check that specifying the function in the basic block sections profile +;; without any other directives is a noop. ;; -;; version 1 profile: -; RUN: echo 'v1' > %t2 -; RUN: echo 'f _Z3foob' >> %t2 +;; Specify the bb sections profile: +; RUN: echo 'v1' > %t +; RUN: echo 'f _Z3foob' >> %t ;; -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX +; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t > %bbsections +; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections > %orig +; RUN: diff -u %orig %bbsections define i32 @_Z3foob(i1 zeroext %0) nounwind { %2 = alloca i32, align 4 @@ -41,45 +37,3 @@ define i32 @_Z3foob(i1 zeroext %0) nounwind { declare i32 @_Z3barv() #1 declare i32 @_Z3bazv() #1 - -define i32 @_Z3zipb(i1 zeroext %0) nounwind { - %2 = alloca i32, align 4 - %3 = alloca i8, align 1 - %4 = zext i1 %0 to i8 - store i8 %4, ptr %3, align 1 - %5 = load i8, ptr %3, align 1 - %6 = trunc i8 %5 to i1 - %7 = zext i1 %6 to i32 - %8 = icmp sgt i32 %7, 0 - br i1 %8, label %9, label %11 - -9: ; preds = %1 - %10 = call i32 @_Z3barv() - store i32 %10, ptr %2, align 4 - br label %13 - -11: ; preds = %1 - %12 = call i32 @_Z3bazv() - store i32 %12, ptr %2, align 4 - br label %13 - -13: ; preds = %11, %9 - %14 = load i32, ptr %2, align 4 - ret i32 %14 -} - -; LINUX-SECTIONS-NO-GUIDED-PREFIX: .section .text._Z3foob,"ax",@progbits -; LINUX-SECTIONS: .section .text.hot._Z3foob,"ax",@progbits -; LINUX-SECTIONS: _Z3foob: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.1,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.1: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.2,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.2: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.3,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.3: - -; LINUX-SECTIONS-FUNCTION-SECTION: .section .text._Z3zipb,"ax",@progbits -; LINUX-SECTIONS-NO-FUNCTION-SECTION-NOT: .section .text{{.*}}._Z3zipb,"ax",@progbits -; LINUX-SECTIONS: _Z3zipb: -; LINUX-SECTIONS-NOT: .section .text{{.*}}._Z3zipb.__part.{{[0-9]+}},"ax",@progbits -; LINUX-SECTIONS-NOT: _Z3zipb.__part.{{[0-9]+}}: diff --git a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll index d481b147662dc..6e0db20ca0492 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll @@ -1,6 +1,8 @@ -; RUN: echo "!foo" > %t.order.txt -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt | FileCheck --check-prefix=SOURCE-DRIFT %s -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s +; RUN: echo "v1" > %t +; RUN: echo "f foo" >> %t +; RUN: echo "c 0" >> %t +; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t | FileCheck --check-prefix=SOURCE-DRIFT %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s define dso_local i32 @foo(i1 zeroext %0, i1 zeroext %1) !annotation !1 { br i1 %0, label %5, label %3 diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index 86d7df0c2d648..fae1ff90dd8d5 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -216,8 +216,8 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind { define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { ; SSE-LABEL: bitcast_v16i8_to_v2i8: ; SSE: # %bb.0: -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -225,8 +225,8 @@ define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { ; ; AVX12-LABEL: bitcast_v16i8_to_v2i8: ; AVX12: # %bb.0: -; AVX12-NEXT: vpmovmskb %xmm0, %ecx -; AVX12-NEXT: movl %ecx, %eax +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: movl %eax, %ecx ; AVX12-NEXT: shrl $8, %eax ; AVX12-NEXT: addb %cl, %al ; AVX12-NEXT: # kill: def $al killed $al killed $eax @@ -441,8 +441,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: bitcast_v16i16_to_v2i8: ; SSE: # %bb.0: ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -452,8 +452,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: addb %cl, %al ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -464,8 +464,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: addb %cl, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -762,8 +762,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: packssdw %xmm3, %xmm2 ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -776,8 +776,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: addb %cl, %al ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -793,8 +793,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: addb %cl, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll index 4d41c8406f6e0..a42a715bdc6ab 100644 --- a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll +++ b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll @@ -7,8 +7,8 @@ define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0_(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0_: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .p2align 4 @@ -68,8 +68,8 @@ _ZNK4llvm5APInt13getActiveBitsEv.exit.i.i: ; preds = %for.body.i.i.i.i.i define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0__assert(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0__assert: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB1_1 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll index c1beb7c803b2b..c9c88f7258435 100644 --- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll +++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll @@ -1031,31 +1031,30 @@ define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind { ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: je .LBB17_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $1, %r15d +; CHECK-NEXT: movl $1, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB17_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB17_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB17_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1199,32 +1198,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_ ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB21_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB21_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB21_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB21_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1251,32 +1249,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB22_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB22_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB22_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB22_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1303,31 +1300,30 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem( ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB23_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB23_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB23_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB23_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1404,32 +1400,31 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3 ; CHECK-NEXT: cmpl %edx, %edi ; CHECK-NEXT: jbe .LBB25_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r15d -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edx, %ebx +; CHECK-NEXT: movl %esi, %ebp ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: addl $-2, %r15d +; CHECK-NEXT: addl $-2, %ebx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB25_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %ebx +; CHECK-NEXT: divl %ebp ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $-2, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB25_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB25_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index e223765eb887b..46b2571e196bb 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -490,20 +490,21 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind { define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_ashr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sarl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sarl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $6, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: sarl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: sarl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: sarl $6, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = ashr exact i32 %a0, 3 %y = freeze i32 %x @@ -604,20 +605,21 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind { define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_lshr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: shrl $5, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: shrl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrl $5, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = lshr exact i32 %a0, 3 %y = freeze i32 %x diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll index cffd88c55bb0a..477a0dce5c81c 100644 --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll @@ -111,62 +111,63 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: orl %ecx, %eax ; X86-NOBMI-NEXT: je .LBB1_3 ; X86-NOBMI-NEXT: # %bb.1: # %for.body.preheader -; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: xorl %edx, %edx +; X86-NOBMI-NEXT: xorl %esi, %esi ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: movl $0, (%esp) # 4-byte Folded Spill +; X86-NOBMI-NEXT: xorl %edi, %edi +; X86-NOBMI-NEXT: xorl %ebp, %ebp ; X86-NOBMI-NEXT: .p2align 4 ; X86-NOBMI-NEXT: .LBB1_2: # %for.body ; X86-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %edi -; X86-NOBMI-NEXT: movl 4(%eax,%ecx,8), %ebx +; X86-NOBMI-NEXT: movl (%eax,%edi,8), %ebp +; X86-NOBMI-NEXT: movl 4(%eax,%edi,8), %ebx ; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %edi, %eax -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebp +; X86-NOBMI-NEXT: movl %ebp, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %ebx, %eax -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebx -; X86-NOBMI-NEXT: movl %eax, %esi -; X86-NOBMI-NEXT: addl %ebp, %esi -; X86-NOBMI-NEXT: adcl $0, %ebx -; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: movl %eax, %ebx +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl $0, %edx +; X86-NOBMI-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: mull %edx -; X86-NOBMI-NEXT: movl %edx, %ebp -; X86-NOBMI-NEXT: movl %eax, %edi -; X86-NOBMI-NEXT: addl %esi, %edi -; X86-NOBMI-NEXT: adcl %ebx, %ebp -; X86-NOBMI-NEXT: setb %bl +; X86-NOBMI-NEXT: movl %eax, %ebp +; X86-NOBMI-NEXT: addl %ebx, %ebp +; X86-NOBMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ebx +; X86-NOBMI-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOBMI-NEXT: addl %ebp, %eax -; X86-NOBMI-NEXT: movzbl %bl, %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NOBMI-NEXT: adcl %esi, %edx -; X86-NOBMI-NEXT: movl %ecx, %ebx -; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl $0, %eax -; X86-NOBMI-NEXT: adcl $0, %edx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebx,8) -; X86-NOBMI-NEXT: movl %ebx, %ecx -; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebx,8) -; X86-NOBMI-NEXT: addl $1, %ecx -; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-NOBMI-NEXT: adcl $0, %edi -; X86-NOBMI-NEXT: movl %ecx, %esi -; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NOBMI-NEXT: xorl %ebp, %edi -; X86-NOBMI-NEXT: orl %esi, %edi +; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: addl %ebx, %esi +; X86-NOBMI-NEXT: movl %ecx, %eax +; X86-NOBMI-NEXT: movzbl (%esp), %ebx # 1-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ecx +; X86-NOBMI-NEXT: adcl %ebx, %ecx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl %eax, %ebp +; X86-NOBMI-NEXT: adcl $0, %esi +; X86-NOBMI-NEXT: adcl $0, %ecx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl %edx, (%eax,%edi,8) +; X86-NOBMI-NEXT: movl %ebp, 4(%eax,%edi,8) +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI-NEXT: addl $1, %edi +; X86-NOBMI-NEXT: adcl $0, %ebp +; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: xorl %edx, %eax +; X86-NOBMI-NEXT: movl %ebp, %edx +; X86-NOBMI-NEXT: xorl %ebx, %edx +; X86-NOBMI-NEXT: orl %eax, %edx ; X86-NOBMI-NEXT: jne .LBB1_2 ; X86-NOBMI-NEXT: .LBB1_3: # %for.end ; X86-NOBMI-NEXT: xorl %eax, %eax @@ -184,71 +185,66 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-BMI-NEXT: pushl %ebx ; X86-BMI-NEXT: pushl %edi ; X86-BMI-NEXT: pushl %esi -; X86-BMI-NEXT: subl $20, %esp +; X86-BMI-NEXT: subl $16, %esp ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: orl %ecx, %eax ; X86-BMI-NEXT: je .LBB1_3 ; X86-BMI-NEXT: # %bb.1: # %for.body.preheader -; X86-BMI-NEXT: xorl %ecx, %ecx -; X86-BMI-NEXT: xorl %eax, %eax +; X86-BMI-NEXT: xorl %esi, %esi +; X86-BMI-NEXT: xorl %edi, %edi ; X86-BMI-NEXT: xorl %ebx, %ebx -; X86-BMI-NEXT: xorl %ebp, %ebp +; X86-BMI-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-BMI-NEXT: .p2align 4 ; X86-BMI-NEXT: .LBB1_2: # %for.body ; X86-BMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: movl (%eax,%ebx,8), %ecx -; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %esi -; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %ebp +; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI-NEXT: mulxl %eax, %edx, %edi +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %edx, %eax +; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %esi, %edx -; X86-BMI-NEXT: mulxl %eax, %esi, %eax -; X86-BMI-NEXT: addl %edi, %esi -; X86-BMI-NEXT: adcl $0, %eax +; X86-BMI-NEXT: movl %ebp, %edx +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %eax, %ebp +; X86-BMI-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, %ebp ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp -; X86-BMI-NEXT: addl %esi, %edi -; X86-BMI-NEXT: adcl %eax, %ebp +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %ecx, %edx +; X86-BMI-NEXT: addl %eax, %ecx +; X86-BMI-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-BMI-NEXT: movl %esi, %eax +; X86-BMI-NEXT: adcl %ebp, %edx +; X86-BMI-NEXT: movl %edx, %ebp ; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %esi, %edi ; X86-BMI-NEXT: setb %dl -; X86-BMI-NEXT: addl %ebp, %ecx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI-NEXT: addl %ebp, %esi ; X86-BMI-NEXT: movzbl %dl, %edx -; X86-BMI-NEXT: adcl %edx, %eax -; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-BMI-NEXT: adcl (%esp), %edi # 4-byte Folded Reload -; X86-BMI-NEXT: adcl $0, %ecx -; X86-BMI-NEXT: adcl $0, %edx -; X86-BMI-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI-NEXT: movl %eax, (%edx,%ebx,8) -; X86-BMI-NEXT: movl %edi, 4(%edx,%ebx,8) -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-BMI-NEXT: adcl %edx, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-BMI-NEXT: addl %eax, %edx +; X86-BMI-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, %esi +; X86-BMI-NEXT: adcl $0, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %edx, (%eax,%ebx,8) +; X86-BMI-NEXT: movl %ecx, 4(%eax,%ebx,8) ; X86-BMI-NEXT: addl $1, %ebx -; X86-BMI-NEXT: adcl $0, %ebp -; X86-BMI-NEXT: movl %ebx, %edx -; X86-BMI-NEXT: xorl %esi, %edx -; X86-BMI-NEXT: movl %ebp, %esi -; X86-BMI-NEXT: xorl %edi, %esi -; X86-BMI-NEXT: orl %edx, %esi -; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-BMI-NEXT: adcl $0, %ecx +; X86-BMI-NEXT: movl %ebx, %eax +; X86-BMI-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: xorl %ebp, %ecx +; X86-BMI-NEXT: orl %eax, %ecx ; X86-BMI-NEXT: jne .LBB1_2 ; X86-BMI-NEXT: .LBB1_3: # %for.end ; X86-BMI-NEXT: xorl %eax, %eax ; X86-BMI-NEXT: xorl %edx, %edx -; X86-BMI-NEXT: addl $20, %esp +; X86-BMI-NEXT: addl $16, %esp ; X86-BMI-NEXT: popl %esi ; X86-BMI-NEXT: popl %edi ; X86-BMI-NEXT: popl %ebx @@ -261,11 +257,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-NOBMI-NEXT: je .LBB1_3 ; X64-NOBMI-NEXT: # %bb.1: # %for.body.preheader ; X64-NOBMI-NEXT: movq %rdx, %r8 -; X64-NOBMI-NEXT: xorl %r10d, %r10d +; X64-NOBMI-NEXT: xorl %edx, %edx ; X64-NOBMI-NEXT: xorl %r9d, %r9d ; X64-NOBMI-NEXT: .p2align 4 ; X64-NOBMI-NEXT: .LBB1_2: # %for.body ; X64-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: movq %rcx, %rax ; X64-NOBMI-NEXT: mulq (%r8,%r9,8) ; X64-NOBMI-NEXT: addq %r10, %rax @@ -273,7 +270,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-NOBMI-NEXT: movq %rax, (%rsi,%r9,8) ; X64-NOBMI-NEXT: incq %r9 ; X64-NOBMI-NEXT: cmpq %r9, %rdi -; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: jne .LBB1_2 ; X64-NOBMI-NEXT: .LBB1_3: # %for.end ; X64-NOBMI-NEXT: xorl %eax, %eax @@ -285,11 +281,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-BMI-NEXT: je .LBB1_3 ; X64-BMI-NEXT: # %bb.1: # %for.body.preheader ; X64-BMI-NEXT: movq %rdx, %rax -; X64-BMI-NEXT: xorl %r9d, %r9d +; X64-BMI-NEXT: xorl %edx, %edx ; X64-BMI-NEXT: xorl %r8d, %r8d ; X64-BMI-NEXT: .p2align 4 ; X64-BMI-NEXT: .LBB1_2: # %for.body ; X64-BMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: movq %rcx, %rdx ; X64-BMI-NEXT: mulxq (%rax,%r8,8), %r10, %rdx ; X64-BMI-NEXT: addq %r9, %r10 @@ -297,7 +294,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-BMI-NEXT: movq %r10, (%rsi,%r8,8) ; X64-BMI-NEXT: incq %r8 ; X64-BMI-NEXT: cmpq %r8, %rdi -; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: jne .LBB1_2 ; X64-BMI-NEXT: .LBB1_3: # %for.end ; X64-BMI-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll index 53b70fa38958b..c98889b7d5cb3 100644 --- a/llvm/test/CodeGen/X86/icmp-abs-C.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll @@ -161,22 +161,22 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind { ; X86-LABEL: ne_and_with_dom_abs: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movswl %ax, %ecx +; X86-NEXT: sarl $15, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: xorl $12312, %eax # imm = 0x3018 ; X86-NEXT: movzwl %ax, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpw $64, %cx -; X86-NEXT: setne %cl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpw $64, %dx +; X86-NEXT: setne %dl ; X86-NEXT: cmpl $2345, %esi # imm = 0x929 ; X86-NEXT: jae .LBB3_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movb %dl, %cl +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB3_2: ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index caec02eaa19c7..2f691e7ca8f5b 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -255,9 +255,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-KNL-NEXT: kmovw %k1, %k2 ; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} -; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} -; X64-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X64-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} +; X64-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0 ; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test7: @@ -271,9 +271,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-KNL-NEXT: kmovw %k1, %k2 ; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} -; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} -; X86-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} +; X86-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0 ; X86-KNL-NEXT: retl ; ; X64-SKX-LABEL: test7: diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll index a75d42ed0c50f..c058e37e0ce11 100644 --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -658,9 +658,9 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -710,9 +710,9 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setbe %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -765,9 +765,9 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %ecx +; X86-NEXT: movzwl (%eax), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -817,11 +817,11 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_reg_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -871,12 +871,12 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_mem_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index 73d459ba77026..8f97d2652bc53 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -403,11 +403,11 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: je .LBB3_1 ; X86-NEXT: # %bb.2: # %bb26.preheader ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB3_3: # %bb26 @@ -427,7 +427,6 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind { ; X86-NEXT: jb .LBB3_3 ; X86-NEXT: jmp .LBB3_4 ; X86-NEXT: .LBB3_1: -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB3_4: # %bb31 ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll b/llvm/test/CodeGen/X86/mul-constant-i16.ll index b1aa789e53cd7..a663f6a1dd376 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i16.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll @@ -715,8 +715,8 @@ define i16 @test_mul_by_66(i16 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $6, %eax -; X64-NEXT: leal (%rax,%rdi,2), %eax +; X64-NEXT: shll $6, %edi +; X64-NEXT: leal (%rdi,%rax,2), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %mul = mul nsw i16 %x, 66 @@ -757,8 +757,8 @@ define i16 @test_mul_by_520(i16 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $9, %eax -; X64-NEXT: leal (%rax,%rdi,8), %eax +; X64-NEXT: shll $9, %edi +; X64-NEXT: leal (%rdi,%rax,8), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %mul = mul nsw i16 %x, 520 diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll index 79889b9ace406..4129b44ed3ddc 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i32.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll @@ -1155,16 +1155,16 @@ define i32 @test_mul_by_66(i32 %x) { ; X64-HSW: # %bb.0: ; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-HSW-NEXT: movl %edi, %eax -; X64-HSW-NEXT: shll $6, %eax -; X64-HSW-NEXT: leal (%rax,%rdi,2), %eax +; X64-HSW-NEXT: shll $6, %edi +; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax ; X64-HSW-NEXT: retq ; ; X64-JAG-LABEL: test_mul_by_66: ; X64-JAG: # %bb.0: ; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi ; X64-JAG-NEXT: movl %edi, %eax -; X64-JAG-NEXT: shll $6, %eax -; X64-JAG-NEXT: leal (%rax,%rdi,2), %eax +; X64-JAG-NEXT: shll $6, %edi +; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax ; X64-JAG-NEXT: retq ; ; X86-NOOPT-LABEL: test_mul_by_66: @@ -1241,16 +1241,16 @@ define i32 @test_mul_by_520(i32 %x) { ; X64-HSW: # %bb.0: ; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-HSW-NEXT: movl %edi, %eax -; X64-HSW-NEXT: shll $9, %eax -; X64-HSW-NEXT: leal (%rax,%rdi,8), %eax +; X64-HSW-NEXT: shll $9, %edi +; X64-HSW-NEXT: leal (%rdi,%rax,8), %eax ; X64-HSW-NEXT: retq ; ; X64-JAG-LABEL: test_mul_by_520: ; X64-JAG: # %bb.0: ; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi ; X64-JAG-NEXT: movl %edi, %eax -; X64-JAG-NEXT: shll $9, %eax -; X64-JAG-NEXT: leal (%rax,%rdi,8), %eax +; X64-JAG-NEXT: shll $9, %edi +; X64-JAG-NEXT: leal (%rdi,%rax,8), %eax ; X64-JAG-NEXT: retq ; ; X86-NOOPT-LABEL: test_mul_by_520: diff --git a/llvm/test/CodeGen/X86/mul-constant-i8.ll b/llvm/test/CodeGen/X86/mul-constant-i8.ll index a4fa1ee8c0029..b488653655728 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i8.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i8.ll @@ -425,8 +425,8 @@ define i8 @test_mul_by_66(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $6, %eax -; X64-NEXT: leal (%rax,%rdi,2), %eax +; X64-NEXT: shll $6, %edi +; X64-NEXT: leal (%rdi,%rax,2), %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 66 diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll index 283c00e17f21a..b6af7e1641a9c 100644 --- a/llvm/test/CodeGen/X86/optimize-max-0.ll +++ b/llvm/test/CodeGen/X86/optimize-max-0.ll @@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $28, %esp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: imull %ebp, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: imull %esi, %eax ; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %ecx, (%esp) ## 4-byte Spill +; CHECK-NEXT: movl %eax, (%esp) ## 4-byte Spill ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.1: ## %bb10.preheader -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: sarl $31, %eax -; CHECK-NEXT: shrl $30, %eax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: sarl $2, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl %eax, %ebp +; CHECK-NEXT: sarl $31, %ebp +; CHECK-NEXT: shrl $30, %ebp +; CHECK-NEXT: addl %eax, %ebp +; CHECK-NEXT: sarl $2, %ebp +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.2: ## %bb.nph9 -; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: testl %esi, %esi ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.3: ## %bb.nph9.split ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: incl %eax ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_4: ## %bb6 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movzbl (%eax,%esi,2), %ebx -; CHECK-NEXT: movb %bl, (%edx,%esi) -; CHECK-NEXT: incl %esi -; CHECK-NEXT: cmpl %ebp, %esi +; CHECK-NEXT: movzbl (%eax,%edi,2), %ebx +; CHECK-NEXT: movb %bl, (%edx,%edi) +; CHECK-NEXT: incl %edi +; CHECK-NEXT: cmpl %esi, %edi ; CHECK-NEXT: jl LBB0_4 ; CHECK-NEXT: ## %bb.5: ## %bb9 ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: incl %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebp, %edx -; CHECK-NEXT: cmpl %edi, %ecx +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: je LBB0_12 ; CHECK-NEXT: ## %bb.6: ## %bb7.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: jmp LBB0_4 ; CHECK-NEXT: LBB0_12: ## %bb18.loopexit +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl (%esp), %eax ## 4-byte Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %ebp, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: cmpl $1, %edi +; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) ; CHECK-NEXT: jle LBB0_13 ; CHECK-NEXT: ## %bb.7: ## %bb.nph5 -; CHECK-NEXT: cmpl $2, %ebp +; CHECK-NEXT: cmpl $2, %esi ; CHECK-NEXT: jl LBB0_13 ; CHECK-NEXT: ## %bb.8: ## %bb.nph5.split -; CHECK-NEXT: movl %ebp, %edx -; CHECK-NEXT: shrl $31, %edx -; CHECK-NEXT: addl %ebp, %edx -; CHECK-NEXT: sarl %edx +; CHECK-NEXT: movl %esi, %ebp +; CHECK-NEXT: shrl $31, %ebp +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: sarl %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $31, %ecx @@ -84,102 +84,103 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_9: ## %bb13 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB0_10 Depth 2 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %edx, %edi ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB0_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%ecx,%esi) -; CHECK-NEXT: movzbl (%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%eax,%esi) -; CHECK-NEXT: incl %esi -; CHECK-NEXT: cmpl %edx, %esi +; CHECK-NEXT: movzbl -2(%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%ecx,%ebx) +; CHECK-NEXT: movzbl (%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%eax,%ebx) +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: cmpl %ebp, %ebx ; CHECK-NEXT: jl LBB0_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB0_9 Depth=1 ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; CHECK-NEXT: incl %edi -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: addl %ebp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: addl %ebp, %ecx ; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; CHECK-NEXT: jl LBB0_9 ; CHECK-NEXT: LBB0_13: ## %bb20 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: cmpl $1, %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.14: ## %bb20 -; CHECK-NEXT: cmpl $3, %eax +; CHECK-NEXT: cmpl $3, %ecx ; CHECK-NEXT: jne LBB0_24 ; CHECK-NEXT: ## %bb.15: ## %bb22 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; CHECK-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; CHECK-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_18 ; CHECK-NEXT: ## %bb.16: ## %bb.nph -; CHECK-NEXT: leal 15(%edi), %eax +; CHECK-NEXT: leal 15(%edx), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebx, %ebx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: addl %ecx, %ebx -; CHECK-NEXT: addl %eax, %edx -; CHECK-NEXT: leal 15(%ebp), %eax +; CHECK-NEXT: addl %ebp, %ebp +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: addl %edi, %ecx +; CHECK-NEXT: addl %ecx, %ebp +; CHECK-NEXT: addl %eax, %ebx +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_17: ## %bb23 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi ; CHECK-NEXT: pushl %ebx -; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %ebp, %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %ebp, %ebx +; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %ebx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %edi +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_17 ; CHECK-NEXT: LBB0_18: ## %bb26 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl %ecx, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %esi, %edx ; CHECK-NEXT: jmp LBB0_23 ; CHECK-NEXT: LBB0_19: ## %bb29 -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_22 ; CHECK-NEXT: ## %bb.20: ## %bb.nph11 -; CHECK-NEXT: movl %edi, %esi -; CHECK-NEXT: leal 15(%ebp), %eax +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -187,30 +188,32 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: LBB0_21: ## %bb30 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx +; CHECK-NEXT: movl %ebp, %ebx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %edi -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %esi +; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_21 ; CHECK-NEXT: LBB0_22: ## %bb33 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx ; CHECK-NEXT: LBB0_23: ## %bb33 -; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: sarl %eax ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: pushl $128 -; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: pushl %edx ; CHECK-NEXT: calll _memset ; CHECK-NEXT: addl $44, %esp ; CHECK-NEXT: LBB0_25: ## %return @@ -523,38 +526,38 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB1_9: ## %bb13 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB1_10 Depth 2 -; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: andl $1, %ebx ; CHECK-NEXT: movl %edx, (%esp) ## 4-byte Spill -; CHECK-NEXT: addl %edx, %ebx -; CHECK-NEXT: imull {{[0-9]+}}(%esp), %ebx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB1_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB1_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%ebx,%esi,4), %edx -; CHECK-NEXT: movb %dl, (%eax,%esi) -; CHECK-NEXT: movzbl (%ebx,%esi,4), %edx -; CHECK-NEXT: movb %dl, (%ecx,%esi) +; CHECK-NEXT: movzbl -2(%edx,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%eax,%esi) +; CHECK-NEXT: movzbl (%edx,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%ecx,%esi) ; CHECK-NEXT: incl %esi ; CHECK-NEXT: cmpl %ebp, %esi ; CHECK-NEXT: jb LBB1_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB1_9 Depth=1 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; CHECK-NEXT: incl %ebx -; CHECK-NEXT: addl %ebp, %ecx ; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: incl %edx +; CHECK-NEXT: addl %ebp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl $2, %esi ; CHECK-NEXT: addl %ebp, %eax -; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; CHECK-NEXT: jb LBB1_9 ; CHECK-NEXT: LBB1_13: ## %bb20 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll index 420f5ba5ab433..31a7f1125150b 100644 --- a/llvm/test/CodeGen/X86/parity.ll +++ b/llvm/test/CodeGen/X86/parity.ll @@ -219,12 +219,12 @@ define i64 @parity_64(i64 %x) { ; ; X64-NOPOPCNT-LABEL: parity_64: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al @@ -264,12 +264,12 @@ define i32 @parity_64_trunc(i64 %x) { ; ; X64-NOPOPCNT-LABEL: parity_64_trunc: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al @@ -628,12 +628,12 @@ define i64 @parity_64_shift(i64 %0) { ; ; X64-NOPOPCNT-LABEL: parity_64_shift: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll index 21b25d87796a5..ffdb68c7a6c01 100644 --- a/llvm/test/CodeGen/X86/pr166744.ll +++ b/llvm/test/CodeGen/X86/pr166744.ll @@ -31,13 +31,13 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) { ; NOPOSTRA-LABEL: PR166744: ; NOPOSTRA: # %bb.0: ; NOPOSTRA-NEXT: movl %esi, %eax -; NOPOSTRA-NEXT: shrl $3, %eax -; NOPOSTRA-NEXT: andl $60, %eax -; NOPOSTRA-NEXT: movl (%rdi,%rax), %ecx -; NOPOSTRA-NEXT: btrl %esi, %ecx -; NOPOSTRA-NEXT: shlxl %esi, %edx, %edx -; NOPOSTRA-NEXT: orl %ecx, %edx -; NOPOSTRA-NEXT: movl %edx, (%rdi,%rax) +; NOPOSTRA-NEXT: shrl $3, %esi +; NOPOSTRA-NEXT: andl $60, %esi +; NOPOSTRA-NEXT: movl (%rdi,%rsi), %ecx +; NOPOSTRA-NEXT: btrl %eax, %ecx +; NOPOSTRA-NEXT: shlxl %eax, %edx, %eax +; NOPOSTRA-NEXT: orl %ecx, %eax +; NOPOSTRA-NEXT: movl %eax, (%rdi,%rsi) ; NOPOSTRA-NEXT: movq 16(%rdi), %rax ; NOPOSTRA-NEXT: movq (%rdi), %rcx ; NOPOSTRA-NEXT: movq 8(%rdi), %rdx diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll index 8f046a4f5aea5..26e68861cf45c 100644 --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -203,10 +203,10 @@ define i16 @no_extract_mul(i16 %i) nounwind { ; X64-LABEL: no_extract_mul: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $8, %edi ; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: leal (%rax,%rax,8), %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: shrl $9, %eax ; X64-NEXT: orl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll index ce56283df6010..8cb032776114b 100644 --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -10,10 +10,10 @@ declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32) define i32 @func(i32 %x, i32 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: movslq %esi, %rax -; X64-NEXT: movslq %edi, %rcx -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movslq %esi, %rcx +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrq $32, %rax ; X64-NEXT: shldl $30, %ecx, %eax ; X64-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll index e5ea911d4771a..a93be22bf5861 100644 --- a/llvm/test/CodeGen/X86/sshl_sat.ll +++ b/llvm/test/CodeGen/X86/sshl_sat.ll @@ -15,16 +15,16 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movswl %di, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %esi ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: testw %di, %di +; X64-NEXT: testw %dx, %dx ; X64-NEXT: sets %al ; X64-NEXT: addl $32767, %eax # imm = 0x7FFF -; X64-NEXT: cmpw %si, %di -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; @@ -33,17 +33,17 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movswl %dx, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %al ; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx -; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -58,18 +58,18 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax -; X64-NEXT: xorl %edx, %edx +; X64-NEXT: movl %eax, %edx +; X64-NEXT: xorl %esi, %esi ; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %dl -; X64-NEXT: addl $32767, %edx # imm = 0x7FFF -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movswl %si, %edi +; X64-NEXT: sets %sil +; X64-NEXT: addl $32767, %esi # imm = 0x7FFF +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movswl %ax, %edi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %edi -; X64-NEXT: cmpw %di, %ax -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movswl %si, %eax +; X64-NEXT: cmpw %di, %dx +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: cwtl ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index 10dee14bdd1a0..ff76707bdbb69 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -365,119 +365,118 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movswl %bx, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movswl %di, %ebp ; X86-NEXT: sarl %cl, %ebp ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %di, %di +; X86-NEXT: testw %bx, %bx ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %bp, %di -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmovel %ebx, %ecx +; X86-NEXT: cmpw %bp, %bx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: cmovel %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movswl %di, %ebx -; X86-NEXT: sarl %cl, %ebx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %si, %si -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %bx, %si ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi ; X86-NEXT: shll %cl, %esi ; X86-NEXT: movswl %si, %edi ; X86-NEXT: sarl %cl, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %bx, %bx +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmpw %di, %bx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx +; X86-NEXT: cmovel %esi, %ebp ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movswl %dx, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %di, %di ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %di +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edx +; X86-NEXT: sarl %cl, %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmpw %dx, %si +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: cmovel %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi -; X86-NEXT: sarl %cl, %esi +; X86-NEXT: movswl %dx, %eax +; X86-NEXT: sarl %cl, %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmpw %ax, %si ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovel %edx, %ecx -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %dx, %dx ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: cmovel %edx, %ebx +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: cmovel %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %dl ; X86-NEXT: addl $32767, %edx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %ecx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movw %cx, 14(%eax) ; X86-NEXT: movw %dx, 12(%eax) ; X86-NEXT: movw %bx, 10(%eax) -; X86-NEXT: movw %bp, 8(%eax) ; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 6(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movw %cx, 2(%eax) +; X86-NEXT: movw %bp, 2(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $16, %esp diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll index 72406aaa4efa8..9bf88cb8bdf81 100644 --- a/llvm/test/CodeGen/X86/stackmap.ll +++ b/llvm/test/CodeGen/X86/stackmap.ll @@ -1,7 +1,10 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -terminal-rule=0 | FileCheck %s ; ; Note: Print verbose stackmaps using -debug-only=stackmaps. +; FIXME: Test should be fixed to produce the correct sized spill with +; -terminal-rule=0 flag removed + ; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps ; CHECK-NEXT: __LLVM_StackMaps: ; Header @@ -546,8 +549,8 @@ define void @clobberScratch(i32 %a) { ret void } -; A stack frame which needs to be realigned at runtime (to meet alignment -; criteria for values on the stack) does not have a fixed frame size. +; A stack frame which needs to be realigned at runtime (to meet alignment +; criteria for values on the stack) does not have a fixed frame size. ; CHECK-LABEL: .long L{{.*}}-_needsStackRealignment ; CHECK-NEXT: .short 0 ; 0 locations diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index 5bd624c0697a0..01fbafb18eb9f 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -2429,126 +2429,126 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE2-ONLY: # %bb.0: ; SSE2-ONLY-NEXT: movl (%rdi), %eax ; SSE2-ONLY-NEXT: notl %eax -; SSE2-ONLY-NEXT: movw %ax, (%rsi) ; SSE2-ONLY-NEXT: movl %eax, %ecx -; SSE2-ONLY-NEXT: shrl $16, %ecx -; SSE2-ONLY-NEXT: movb %cl, 2(%rsi) -; SSE2-ONLY-NEXT: movb %cl, 2(%rdx) -; SSE2-ONLY-NEXT: movw %ax, (%rdx) -; SSE2-ONLY-NEXT: movb %cl, 6(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 10(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 8(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 14(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 18(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 16(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 22(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 26(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 24(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 30(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 34(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 32(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 38(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 42(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 40(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 46(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 50(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 48(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 54(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 58(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 56(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 62(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) +; SSE2-ONLY-NEXT: movw %ax, (%rsi) +; SSE2-ONLY-NEXT: shrl $16, %eax +; SSE2-ONLY-NEXT: movb %al, 2(%rsi) +; SSE2-ONLY-NEXT: movb %al, 2(%rdx) +; SSE2-ONLY-NEXT: movw %cx, (%rdx) +; SSE2-ONLY-NEXT: movb %al, 6(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 4(%rdx) +; SSE2-ONLY-NEXT: movb %al, 10(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 8(%rdx) +; SSE2-ONLY-NEXT: movb %al, 14(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 12(%rdx) +; SSE2-ONLY-NEXT: movb %al, 18(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 16(%rdx) +; SSE2-ONLY-NEXT: movb %al, 22(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 20(%rdx) +; SSE2-ONLY-NEXT: movb %al, 26(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 24(%rdx) +; SSE2-ONLY-NEXT: movb %al, 30(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 28(%rdx) +; SSE2-ONLY-NEXT: movb %al, 34(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 32(%rdx) +; SSE2-ONLY-NEXT: movb %al, 38(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 36(%rdx) +; SSE2-ONLY-NEXT: movb %al, 42(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 40(%rdx) +; SSE2-ONLY-NEXT: movb %al, 46(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 44(%rdx) +; SSE2-ONLY-NEXT: movb %al, 50(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 48(%rdx) +; SSE2-ONLY-NEXT: movb %al, 54(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 52(%rdx) +; SSE2-ONLY-NEXT: movb %al, 58(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 56(%rdx) +; SSE2-ONLY-NEXT: movb %al, 62(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 60(%rdx) ; SSE2-ONLY-NEXT: retq ; ; SSE3-LABEL: vec384_v3i8: ; SSE3: # %bb.0: ; SSE3-NEXT: movl (%rdi), %eax ; SSE3-NEXT: notl %eax -; SSE3-NEXT: movw %ax, (%rsi) ; SSE3-NEXT: movl %eax, %ecx -; SSE3-NEXT: shrl $16, %ecx -; SSE3-NEXT: movb %cl, 2(%rsi) -; SSE3-NEXT: movb %cl, 2(%rdx) -; SSE3-NEXT: movw %ax, (%rdx) -; SSE3-NEXT: movb %cl, 6(%rdx) -; SSE3-NEXT: movw %ax, 4(%rdx) -; SSE3-NEXT: movb %cl, 10(%rdx) -; SSE3-NEXT: movw %ax, 8(%rdx) -; SSE3-NEXT: movb %cl, 14(%rdx) -; SSE3-NEXT: movw %ax, 12(%rdx) -; SSE3-NEXT: movb %cl, 18(%rdx) -; SSE3-NEXT: movw %ax, 16(%rdx) -; SSE3-NEXT: movb %cl, 22(%rdx) -; SSE3-NEXT: movw %ax, 20(%rdx) -; SSE3-NEXT: movb %cl, 26(%rdx) -; SSE3-NEXT: movw %ax, 24(%rdx) -; SSE3-NEXT: movb %cl, 30(%rdx) -; SSE3-NEXT: movw %ax, 28(%rdx) -; SSE3-NEXT: movb %cl, 34(%rdx) -; SSE3-NEXT: movw %ax, 32(%rdx) -; SSE3-NEXT: movb %cl, 38(%rdx) -; SSE3-NEXT: movw %ax, 36(%rdx) -; SSE3-NEXT: movb %cl, 42(%rdx) -; SSE3-NEXT: movw %ax, 40(%rdx) -; SSE3-NEXT: movb %cl, 46(%rdx) -; SSE3-NEXT: movw %ax, 44(%rdx) -; SSE3-NEXT: movb %cl, 50(%rdx) -; SSE3-NEXT: movw %ax, 48(%rdx) -; SSE3-NEXT: movb %cl, 54(%rdx) -; SSE3-NEXT: movw %ax, 52(%rdx) -; SSE3-NEXT: movb %cl, 58(%rdx) -; SSE3-NEXT: movw %ax, 56(%rdx) -; SSE3-NEXT: movb %cl, 62(%rdx) -; SSE3-NEXT: movw %ax, 60(%rdx) +; SSE3-NEXT: movw %ax, (%rsi) +; SSE3-NEXT: shrl $16, %eax +; SSE3-NEXT: movb %al, 2(%rsi) +; SSE3-NEXT: movb %al, 2(%rdx) +; SSE3-NEXT: movw %cx, (%rdx) +; SSE3-NEXT: movb %al, 6(%rdx) +; SSE3-NEXT: movw %cx, 4(%rdx) +; SSE3-NEXT: movb %al, 10(%rdx) +; SSE3-NEXT: movw %cx, 8(%rdx) +; SSE3-NEXT: movb %al, 14(%rdx) +; SSE3-NEXT: movw %cx, 12(%rdx) +; SSE3-NEXT: movb %al, 18(%rdx) +; SSE3-NEXT: movw %cx, 16(%rdx) +; SSE3-NEXT: movb %al, 22(%rdx) +; SSE3-NEXT: movw %cx, 20(%rdx) +; SSE3-NEXT: movb %al, 26(%rdx) +; SSE3-NEXT: movw %cx, 24(%rdx) +; SSE3-NEXT: movb %al, 30(%rdx) +; SSE3-NEXT: movw %cx, 28(%rdx) +; SSE3-NEXT: movb %al, 34(%rdx) +; SSE3-NEXT: movw %cx, 32(%rdx) +; SSE3-NEXT: movb %al, 38(%rdx) +; SSE3-NEXT: movw %cx, 36(%rdx) +; SSE3-NEXT: movb %al, 42(%rdx) +; SSE3-NEXT: movw %cx, 40(%rdx) +; SSE3-NEXT: movb %al, 46(%rdx) +; SSE3-NEXT: movw %cx, 44(%rdx) +; SSE3-NEXT: movb %al, 50(%rdx) +; SSE3-NEXT: movw %cx, 48(%rdx) +; SSE3-NEXT: movb %al, 54(%rdx) +; SSE3-NEXT: movw %cx, 52(%rdx) +; SSE3-NEXT: movb %al, 58(%rdx) +; SSE3-NEXT: movw %cx, 56(%rdx) +; SSE3-NEXT: movb %al, 62(%rdx) +; SSE3-NEXT: movw %cx, 60(%rdx) ; SSE3-NEXT: retq ; ; SSSE3-ONLY-LABEL: vec384_v3i8: ; SSSE3-ONLY: # %bb.0: ; SSSE3-ONLY-NEXT: movl (%rdi), %eax ; SSSE3-ONLY-NEXT: notl %eax -; SSSE3-ONLY-NEXT: movw %ax, (%rsi) ; SSSE3-ONLY-NEXT: movl %eax, %ecx -; SSSE3-ONLY-NEXT: shrl $16, %ecx -; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi) -; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, (%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 34(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 58(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, (%rsi) +; SSSE3-ONLY-NEXT: shrl $16, %eax +; SSSE3-ONLY-NEXT: movb %al, 2(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 2(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, (%rdx) +; SSSE3-ONLY-NEXT: movb %al, 6(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 4(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 10(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 8(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 14(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 12(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 18(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 16(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 22(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 20(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 26(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 24(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 30(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 28(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 34(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 32(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 38(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 36(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 42(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 40(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 46(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 44(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 50(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 48(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 54(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 52(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 58(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 56(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 62(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 60(%rdx) ; SSSE3-ONLY-NEXT: retq ; ; SSE41-LABEL: vec384_v3i8: diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll index f20b777531c5a..3ad3e9a0e7655 100644 --- a/llvm/test/CodeGen/X86/twoaddr-lea.ll +++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll @@ -65,10 +65,10 @@ entry: define void @ham() { ; CHECK-LABEL: ham: ; CHECK: ## %bb.0: ## %bb +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movq _global@GOTPCREL(%rip), %rdx ; CHECK-NEXT: movq _global2@GOTPCREL(%rip), %rsi -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: je LBB3_2 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index eacc714b49a4d..5a68484596a2f 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -10,10 +10,10 @@ declare <4 x i32> @llvm.umul.fix.v4i32(<4 x i32>, <4 x i32>, i32) define i32 @func(i32 %x, i32 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrq $32, %rax ; X64-NEXT: shldl $30, %ecx, %eax ; X64-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll index e0e1ef7108d0d..9768e4761f47a 100644 --- a/llvm/test/CodeGen/X86/ushl_sat.ll +++ b/llvm/test/CodeGen/X86/ushl_sat.ll @@ -14,23 +14,23 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %eax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movzwl %di, %edx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %eax -; X64-NEXT: cmpw %ax, %di +; X64-NEXT: shrl %cl, %edx +; X64-NEXT: cmpw %dx, %ax ; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X86-LABEL: func: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movzwl %dx, %esi ; X86-NEXT: shrl %cl, %esi @@ -51,14 +51,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax ; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %esi +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movzwl %ax, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %esi -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax -; X64-NEXT: cwtl +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X64-NEXT: cmovel %eax, %ecx +; X64-NEXT: movswl %cx, %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index b8e83da9cf361..762088cfb2935 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -300,95 +300,94 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %si +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %dx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $65535, %esi # imm = 0xFFFF -; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %bp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movzwl %bp, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %si -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bp, %eax +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: cmpw %ax, %di +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %eax, %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bx, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $65535, %esi # imm = 0xFFFF ; X86-NEXT: cmovnel %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movzwl %bp, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: shll %cl, %edi -; X86-NEXT: movzwl %di, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax ; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl $65535, %ebx # imm = 0xFFFF ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movzwl %si, %eax +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzwl %dx, %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: cmpw %ax, %si +; X86-NEXT: cmovnel %ebx, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %bx ; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) -; X86-NEXT: movw %si, 12(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) ; X86-NEXT: movw %di, 10(%ecx) -; X86-NEXT: movw %bx, 8(%ecx) -; X86-NEXT: movw %bp, 6(%ecx) +; X86-NEXT: movw %bp, 8(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 6(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 2(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll index b233855029c58..324fe12de9400 100644 --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -85,14 +85,14 @@ define <4 x i16> @smulfixsat(<4 x i16> %a) { ; CHECK-NEXT: movswl %dx, %edx ; CHECK-NEXT: leal (,%rdx,4), %esi ; CHECK-NEXT: movl %esi, %edi -; CHECK-NEXT: shrl $16, %edi -; CHECK-NEXT: shldw $1, %si, %di +; CHECK-NEXT: shrl $16, %esi +; CHECK-NEXT: shldw $1, %di, %si ; CHECK-NEXT: sarl $14, %edx ; CHECK-NEXT: cmpl $16384, %edx # imm = 0x4000 -; CHECK-NEXT: cmovgel %eax, %edi +; CHECK-NEXT: cmovgel %eax, %esi ; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000 -; CHECK-NEXT: cmovll %ecx, %edi -; CHECK-NEXT: pinsrw $3, %edi, %xmm1 +; CHECK-NEXT: cmovll %ecx, %esi +; CHECK-NEXT: pinsrw $3, %esi, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %t = call <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16> , <4 x i16> %a, i32 15) @@ -106,19 +106,19 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) { ; CHECK-NEXT: pextrw $2, %xmm0, %eax ; CHECK-NEXT: leal (%rax,%rax,2), %eax ; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shrl $16, %edx -; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: shldw $1, %ax, %cx -; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000 +; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shldw $1, %dx, %cx +; CHECK-NEXT: cmpl $32768, %eax # imm = 0x8000 ; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF ; CHECK-NEXT: cmovael %eax, %ecx ; CHECK-NEXT: pextrw $1, %xmm0, %edx ; CHECK-NEXT: addl %edx, %edx ; CHECK-NEXT: movl %edx, %esi -; CHECK-NEXT: shrl $16, %esi -; CHECK-NEXT: movl %esi, %edi -; CHECK-NEXT: shldw $1, %dx, %di -; CHECK-NEXT: cmpl $32768, %esi # imm = 0x8000 +; CHECK-NEXT: shrl $16, %edx +; CHECK-NEXT: movl %edx, %edi +; CHECK-NEXT: shldw $1, %si, %di +; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000 ; CHECK-NEXT: cmovael %eax, %edi ; CHECK-NEXT: movd %xmm0, %edx ; CHECK-NEXT: xorl %esi, %esi @@ -133,10 +133,10 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) { ; CHECK-NEXT: pextrw $3, %xmm0, %ecx ; CHECK-NEXT: shll $2, %ecx ; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: shrl $16, %edx -; CHECK-NEXT: movl %edx, %esi -; CHECK-NEXT: shldw $1, %cx, %si -; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000 +; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: movl %ecx, %esi +; CHECK-NEXT: shldw $1, %dx, %si +; CHECK-NEXT: cmpl $32768, %ecx # imm = 0x8000 ; CHECK-NEXT: cmovael %eax, %esi ; CHECK-NEXT: pinsrw $3, %esi, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll index 320dce840ea57..6cb43234d713b 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -397,8 +397,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -409,8 +409,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -421,8 +421,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512VL-NEXT: vpmovw2m %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -722,8 +722,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -734,8 +734,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -746,8 +746,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -974,13 +974,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: shrq $32, %rcx -; AVX512BW-NEXT: xorl %eax, %ecx -; AVX512BW-NEXT: movl %ecx, %eax -; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrq $32, %rax ; AVX512BW-NEXT: xorl %ecx, %eax -; AVX512BW-NEXT: xorb %ah, %al +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: xorl %eax, %ecx +; AVX512BW-NEXT: xorb %ch, %cl ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -990,13 +990,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovb2m %zmm0, %k0 ; AVX512VL-NEXT: kmovq %k0, %rax -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: xorl %eax, %ecx -; AVX512VL-NEXT: movl %ecx, %eax -; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrq $32, %rax ; AVX512VL-NEXT: xorl %ecx, %eax -; AVX512VL-NEXT: xorb %ah, %al +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrl $16, %ecx +; AVX512VL-NEXT: xorl %eax, %ecx +; AVX512VL-NEXT: xorb %ch, %cl ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -1211,8 +1211,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind { ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1222,8 +1222,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind { ; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, zeroinitializer @@ -1427,8 +1427,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1439,8 +1439,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512BW-NEXT: vptestnmw %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1450,8 +1450,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512VL-NEXT: vptestnmw %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -1756,8 +1756,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1767,8 +1767,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1778,8 +1778,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2010,13 +2010,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: shrq $32, %rcx -; AVX512BW-NEXT: xorl %eax, %ecx -; AVX512BW-NEXT: movl %ecx, %eax -; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrq $32, %rax ; AVX512BW-NEXT: xorl %ecx, %eax -; AVX512BW-NEXT: xorb %ah, %al +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: xorl %eax, %ecx +; AVX512BW-NEXT: xorb %ch, %cl ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2025,13 +2025,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovq %k0, %rax -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: xorl %eax, %ecx -; AVX512VL-NEXT: movl %ecx, %eax -; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrq $32, %rax ; AVX512VL-NEXT: xorl %ecx, %eax -; AVX512VL-NEXT: xorb %ah, %al +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrl $16, %ecx +; AVX512VL-NEXT: xorl %eax, %ecx +; AVX512VL-NEXT: xorb %ch, %cl ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2240,8 +2240,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind { ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2251,8 +2251,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind { ; AVX512VL-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, %1 @@ -2504,8 +2504,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2517,8 +2517,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind { ; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2528,8 +2528,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind { ; AVX512VL-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2845,8 +2845,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind { ; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2856,8 +2856,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind { ; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2867,8 +2867,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind { ; AVX512VL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -3097,13 +3097,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: shrq $32, %rcx -; AVX512BW-NEXT: xorl %eax, %ecx -; AVX512BW-NEXT: movl %ecx, %eax -; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrq $32, %rax ; AVX512BW-NEXT: xorl %ecx, %eax -; AVX512BW-NEXT: xorb %ah, %al +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: xorl %eax, %ecx +; AVX512BW-NEXT: xorb %ch, %cl ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3112,13 +3112,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512VL-NEXT: kmovq %k0, %rax -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: xorl %eax, %ecx -; AVX512VL-NEXT: movl %ecx, %eax -; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrq $32, %rax ; AVX512VL-NEXT: xorl %ecx, %eax -; AVX512VL-NEXT: xorb %ah, %al +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrl $16, %ecx +; AVX512VL-NEXT: xorl %eax, %ecx +; AVX512VL-NEXT: xorb %ch, %cl ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index 3c98eba69ae5b..1c3d27fac4203 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -777,31 +777,31 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, (%esp) +; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: andb $12, %bl -; FALLBACK18-NEXT: movzbl %bl, %esi -; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi -; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx -; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: orl %ebp, %ecx -; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %ebp, %edi -; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx -; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi -; FALLBACK18-NEXT: shrxl %eax, %esi, %eax -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %edx -; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: movzbl %bl, %edi +; FALLBACK18-NEXT: movl 4(%esp,%edi), %ebx +; FALLBACK18-NEXT: movl 8(%esp,%edi), %esi +; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebp +; FALLBACK18-NEXT: notb %al +; FALLBACK18-NEXT: leal (%esi,%esi), %edx +; FALLBACK18-NEXT: shlxl %eax, %edx, %edx +; FALLBACK18-NEXT: orl %ebp, %edx +; FALLBACK18-NEXT: shrxl %ecx, (%esp,%edi), %ebp +; FALLBACK18-NEXT: addl %ebx, %ebx +; FALLBACK18-NEXT: shlxl %eax, %ebx, %ebx +; FALLBACK18-NEXT: orl %ebp, %ebx +; FALLBACK18-NEXT: movl 12(%esp,%edi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ebp +; FALLBACK18-NEXT: shlxl %eax, %ebp, %eax +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK18-NEXT: orl %esi, %eax +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK18-NEXT: movl %eax, 12(%esi) -; FALLBACK18-NEXT: movl %edx, 8(%esi) -; FALLBACK18-NEXT: movl %edi, (%esi) -; FALLBACK18-NEXT: movl %ecx, 4(%esi) +; FALLBACK18-NEXT: movl %ecx, 12(%esi) +; FALLBACK18-NEXT: movl %eax, 8(%esi) +; FALLBACK18-NEXT: movl %ebx, (%esi) +; FALLBACK18-NEXT: movl %edx, 4(%esi) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -962,42 +962,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: pushl %ebx ; FALLBACK22-NEXT: pushl %edi ; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $44, %esp +; FALLBACK22-NEXT: subl $60, %esp ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: xorps %xmm1, %xmm1 ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, (%esp) -; FALLBACK22-NEXT: andb $12, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: notb %cl -; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK22-NEXT: orl %ebx, %edx -; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK22-NEXT: shrxl %eax, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %ebx, %edi -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: andb $12, %dl +; FALLBACK22-NEXT: movzbl %dl, %edi +; FALLBACK22-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK22-NEXT: notb %al +; FALLBACK22-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %eax, %edx, %edx +; FALLBACK22-NEXT: orl %ebp, %edx +; FALLBACK22-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK22-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK22-NEXT: shlxl %eax, %edi, %edi +; FALLBACK22-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: addl %ebx, %ebx +; FALLBACK22-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %eax, 12(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) +; FALLBACK22-NEXT: movl %ecx, 12(%esi) +; FALLBACK22-NEXT: movl %eax, 4(%esi) ; FALLBACK22-NEXT: movl %edi, 8(%esi) ; FALLBACK22-NEXT: movl %edx, (%esi) -; FALLBACK22-NEXT: addl $44, %esp +; FALLBACK22-NEXT: addl $60, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx @@ -1152,42 +1153,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $44, %esp +; FALLBACK26-NEXT: subl $60, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK26-NEXT: andb $12, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: notb %cl -; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK26-NEXT: orl %ebx, %edx -; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK26-NEXT: shrxl %eax, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: orl %ebx, %edi -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: andb $12, %dl +; FALLBACK26-NEXT: movzbl %dl, %edi +; FALLBACK26-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %eax, %edx, %edx +; FALLBACK26-NEXT: orl %ebp, %edx +; FALLBACK26-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK26-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK26-NEXT: shlxl %eax, %edi, %edi +; FALLBACK26-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %eax, 12(%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) +; FALLBACK26-NEXT: movl %ecx, 12(%esi) +; FALLBACK26-NEXT: movl %eax, 4(%esi) ; FALLBACK26-NEXT: movl %edi, 8(%esi) ; FALLBACK26-NEXT: movl %edx, (%esi) -; FALLBACK26-NEXT: addl $44, %esp +; FALLBACK26-NEXT: addl $60, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx @@ -1342,42 +1344,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $44, %esp +; FALLBACK30-NEXT: subl $60, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK30-NEXT: andb $12, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: notb %cl -; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK30-NEXT: orl %ebx, %edx -; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK30-NEXT: shrxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: orl %ebx, %edi -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: andb $12, %dl +; FALLBACK30-NEXT: movzbl %dl, %edi +; FALLBACK30-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %eax, %edx, %edx +; FALLBACK30-NEXT: orl %ebp, %edx +; FALLBACK30-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK30-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK30-NEXT: shlxl %eax, %edi, %edi +; FALLBACK30-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %eax, 12(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) +; FALLBACK30-NEXT: movl %ecx, 12(%esi) +; FALLBACK30-NEXT: movl %eax, 4(%esi) ; FALLBACK30-NEXT: movl %edi, 8(%esi) ; FALLBACK30-NEXT: movl %edx, (%esi) -; FALLBACK30-NEXT: addl $44, %esp +; FALLBACK30-NEXT: addl $60, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx @@ -1784,41 +1787,41 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl 4(%ecx), %esi ; FALLBACK18-NEXT: movl 8(%ecx), %edi ; FALLBACK18-NEXT: movl 12(%ecx), %ecx -; FALLBACK18-NEXT: movzbl (%eax), %eax -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: shlb $3, %bl +; FALLBACK18-NEXT: movzbl (%eax), %ebx +; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: shlb $3, %al ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, (%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $12, %al -; FALLBACK18-NEXT: negb %al -; FALLBACK18-NEXT: movsbl %al, %edx -; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi -; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp -; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: movl %eax, %ecx +; FALLBACK18-NEXT: andb $12, %bl +; FALLBACK18-NEXT: negb %bl +; FALLBACK18-NEXT: movsbl %bl, %esi +; FALLBACK18-NEXT: movl 16(%esp,%esi), %ebx +; FALLBACK18-NEXT: movl 20(%esp,%esi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edi ; FALLBACK18-NEXT: notb %al -; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %eax, %edi, %edi -; FALLBACK18-NEXT: orl %esi, %edi -; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx -; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebp +; FALLBACK18-NEXT: shrl %ebx +; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; FALLBACK18-NEXT: movl 24(%esp,%esi), %esi +; FALLBACK18-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %eax, %esi, %esi +; FALLBACK18-NEXT: orl %edi, %esi ; FALLBACK18-NEXT: shrl %edx -; FALLBACK18-NEXT: shrxl %eax, %edx, %edx -; FALLBACK18-NEXT: orl %esi, %edx -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax -; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: shrxl %eax, %edx, %eax +; FALLBACK18-NEXT: orl %ecx, %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK18-NEXT: movl %ebp, (%ecx) ; FALLBACK18-NEXT: movl %eax, 8(%ecx) -; FALLBACK18-NEXT: movl %edx, 12(%ecx) -; FALLBACK18-NEXT: movl %edi, 4(%ecx) +; FALLBACK18-NEXT: movl %esi, 12(%ecx) +; FALLBACK18-NEXT: movl %ebx, 4(%ecx) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -1983,39 +1986,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: xorps %xmm1, %xmm1 ; FALLBACK22-NEXT: movaps %xmm1, (%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $12, %cl -; FALLBACK22-NEXT: negb %cl -; FALLBACK22-NEXT: movsbl %cl, %ecx -; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %edi -; FALLBACK22-NEXT: movl %eax, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK22-NEXT: orl %esi, %edx -; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl %esi, %ebp +; FALLBACK22-NEXT: movl %eax, %ecx +; FALLBACK22-NEXT: andb $12, %dl +; FALLBACK22-NEXT: negb %dl +; FALLBACK22-NEXT: movsbl %dl, %edx +; FALLBACK22-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK22-NEXT: notb %al +; FALLBACK22-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %eax, %esi, %esi +; FALLBACK22-NEXT: orl %edi, %esi +; FALLBACK22-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK22-NEXT: movl %edi, %ebp ; FALLBACK22-NEXT: shrl %ebp -; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %eax, (%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %ebp, 8(%esi) -; FALLBACK22-NEXT: movl %edx, 12(%esi) +; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK22-NEXT: orl %ebx, %ebp +; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK22-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK22-NEXT: shrl %edx +; FALLBACK22-NEXT: shrxl %eax, %edx, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, (%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %ebp, 8(%edx) +; FALLBACK22-NEXT: movl %esi, 12(%edx) ; FALLBACK22-NEXT: addl $44, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -2175,39 +2178,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $12, %cl -; FALLBACK26-NEXT: negb %cl -; FALLBACK26-NEXT: movsbl %cl, %ecx -; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %edi -; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK26-NEXT: orl %esi, %edx -; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl %esi, %ebp +; FALLBACK26-NEXT: movl %eax, %ecx +; FALLBACK26-NEXT: andb $12, %dl +; FALLBACK26-NEXT: negb %dl +; FALLBACK26-NEXT: movsbl %dl, %edx +; FALLBACK26-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %eax, %esi, %esi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK26-NEXT: movl %edi, %ebp ; FALLBACK26-NEXT: shrl %ebp -; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK26-NEXT: orl %edi, %ebp -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %eax, (%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) -; FALLBACK26-NEXT: movl %ebp, 8(%esi) -; FALLBACK26-NEXT: movl %edx, 12(%esi) +; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK26-NEXT: orl %ebx, %ebp +; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK26-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK26-NEXT: shrl %edx +; FALLBACK26-NEXT: shrxl %eax, %edx, %eax +; FALLBACK26-NEXT: orl %edi, %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %ecx, (%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %ebp, 8(%edx) +; FALLBACK26-NEXT: movl %esi, 12(%edx) ; FALLBACK26-NEXT: addl $44, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -2367,39 +2370,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $12, %cl -; FALLBACK30-NEXT: negb %cl -; FALLBACK30-NEXT: movsbl %cl, %ecx -; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %edi -; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK30-NEXT: orl %esi, %edx -; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl %esi, %ebp +; FALLBACK30-NEXT: movl %eax, %ecx +; FALLBACK30-NEXT: andb $12, %dl +; FALLBACK30-NEXT: negb %dl +; FALLBACK30-NEXT: movsbl %dl, %edx +; FALLBACK30-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %eax, %esi, %esi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK30-NEXT: movl %edi, %ebp ; FALLBACK30-NEXT: shrl %ebp -; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK30-NEXT: orl %edi, %ebp -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %eax, (%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %ebp, 8(%esi) -; FALLBACK30-NEXT: movl %edx, 12(%esi) +; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK30-NEXT: orl %ebx, %ebp +; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK30-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK30-NEXT: shrl %edx +; FALLBACK30-NEXT: shrxl %eax, %edx, %eax +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, (%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %ebp, 8(%edx) +; FALLBACK30-NEXT: movl %esi, 12(%edx) ; FALLBACK30-NEXT: addl $44, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -2833,31 +2836,31 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %bl -; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebp -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl -; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebx,%ebx), %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebx, %ebx -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax -; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%edi), %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%edi), %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %ebx, %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%esi,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, (%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%edi), %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %edi, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, (%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%esi) ; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi @@ -3208,30 +3211,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes: @@ -3355,30 +3358,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %esi ; FALLBACK6-NEXT: andb $24, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes: @@ -3487,35 +3490,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-LABEL: lshr_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: movzbl (%rsi), %eax +; FALLBACK10-NEXT: leal (,%rax,8), %ecx ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: movzbl %cl, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movl %ecx, %esi +; FALLBACK10-NEXT: andb $24, %al +; FALLBACK10-NEXT: movzbl %al, %eax +; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rax), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %rdi, %rcx +; FALLBACK10-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 24(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -3623,35 +3626,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-LABEL: lshr_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: movzbl (%rsi), %eax +; FALLBACK14-NEXT: leal (,%rax,8), %ecx ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: movzbl %cl, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movl %ecx, %esi +; FALLBACK14-NEXT: andb $24, %al +; FALLBACK14-NEXT: movzbl %al, %eax +; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rax), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %rdi, %rcx +; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 24(%rdx) +; FALLBACK14-NEXT: movq %rcx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -3914,81 +3917,75 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movl %ebx, %ecx +; FALLBACK18-NEXT: shlb $3, %cl ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, %eax ; FALLBACK18-NEXT: andb $28, %bl -; FALLBACK18-NEXT: movzbl %bl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %esi, %edx +; FALLBACK18-NEXT: movzbl %bl, %esi +; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx +; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%esi), %edi +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: orl %edi, %edx ; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %ebx, %ecx -; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx +; FALLBACK18-NEXT: movl 44(%esp,%esi), %edx +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %esi, %esi +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK18-NEXT: movl %esi, 28(%edi) +; FALLBACK18-NEXT: movl %ecx, 24(%edi) +; FALLBACK18-NEXT: movl %eax, 16(%edi) +; FALLBACK18-NEXT: movl %edx, 20(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edi) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -4261,72 +4258,70 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlb $3, %dl +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: shlb $3, %cl ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %edx, %eax -; FALLBACK22-NEXT: notb %al -; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK22-NEXT: movl %eax, %ebp -; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: movzbl %dl, %ebx +; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK22-NEXT: movl %eax, %edi +; FALLBACK22-NEXT: notb %cl +; FALLBACK22-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK22-NEXT: movl %ebp, %ecx -; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK22-NEXT: orl %edx, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %ebx, %ebx +; FALLBACK22-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK22-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK22-NEXT: shrxl %edi, %edx, %esi +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK22-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edi, %edx, %esi +; FALLBACK22-NEXT: movl %edi, %edx +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebp +; FALLBACK22-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK22-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK22-NEXT: shrxl %edi, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebx ; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK22-NEXT: orl %ebp, %ebx -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %ebp, %edi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl %ecx, 4(%edx) -; FALLBACK22-NEXT: movl %edi, 24(%edx) -; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: orl %eax, %ebx ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, 28(%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %ebx, 24(%edx) +; FALLBACK22-NEXT: movl %edi, 16(%edx) +; FALLBACK22-NEXT: movl %ebp, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4585,70 +4580,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %edx -; FALLBACK26-NEXT: shlb $3, %dl +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %ecx +; FALLBACK26-NEXT: shlb $3, %cl ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %edx, %eax -; FALLBACK26-NEXT: notb %al -; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl %eax, %ebp -; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK26-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: movzbl %dl, %ebx +; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK26-NEXT: movl %eax, %edi +; FALLBACK26-NEXT: notb %cl +; FALLBACK26-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebx -; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK26-NEXT: movl %ebp, %ecx -; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: leal (%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK26-NEXT: orl %edx, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK26-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK26-NEXT: shrxl %edi, %edx, %esi +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK26-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edi, %edx, %esi +; FALLBACK26-NEXT: movl %edi, %edx +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebp +; FALLBACK26-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK26-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK26-NEXT: shrxl %edi, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebx ; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK26-NEXT: orl %ebp, %ebx -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: movl %ecx, %edx -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: orl %ebp, %edi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %eax, 28(%edx) -; FALLBACK26-NEXT: movl %ecx, 4(%edx) -; FALLBACK26-NEXT: movl %edi, 24(%edx) -; FALLBACK26-NEXT: movl %ebx, 16(%edx) +; FALLBACK26-NEXT: orl %eax, %ebx ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %ecx, 28(%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %ebx, 24(%edx) +; FALLBACK26-NEXT: movl %edi, 16(%edx) +; FALLBACK26-NEXT: movl %ebp, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4906,70 +4899,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %edx -; FALLBACK30-NEXT: shlb $3, %dl +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %ecx +; FALLBACK30-NEXT: shlb $3, %cl ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %edx, %eax -; FALLBACK30-NEXT: notb %al -; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK30-NEXT: movl %eax, %ebp -; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: movzbl %dl, %ebx +; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK30-NEXT: movl %eax, %edi +; FALLBACK30-NEXT: notb %cl +; FALLBACK30-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK30-NEXT: movl %ebp, %ecx -; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK30-NEXT: orl %edx, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK30-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK30-NEXT: shrxl %edi, %edx, %esi +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK30-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edi, %edx, %esi +; FALLBACK30-NEXT: movl %edi, %edx +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebp +; FALLBACK30-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK30-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK30-NEXT: shrxl %edi, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebx ; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK30-NEXT: orl %ebp, %ebx -; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: movl %ecx, %edx -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: orl %ebp, %edi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl %ecx, 4(%edx) -; FALLBACK30-NEXT: movl %edi, 24(%edx) -; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: orl %eax, %ebx ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, 28(%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %ebx, 24(%edx) +; FALLBACK30-NEXT: movl %edi, 16(%edx) +; FALLBACK30-NEXT: movl %ebp, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5157,30 +5148,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes_dwordOff: @@ -5307,30 +5298,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %esi ; FALLBACK6-NEXT: andb $6, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx,4), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes_dwordOff: @@ -5441,36 +5432,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK10-LABEL: lshr_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax -; FALLBACK10-NEXT: shlb $5, %al +; FALLBACK10-NEXT: movzbl (%rsi), %eax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: shlb $5, %cl ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $6, %cl -; FALLBACK10-NEXT: movzbl %cl, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movl %ecx, %esi +; FALLBACK10-NEXT: andb $6, %al +; FALLBACK10-NEXT: movzbl %al, %eax +; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rax,4), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %rdi, %rcx +; FALLBACK10-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 24(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -5580,36 +5571,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK14-LABEL: lshr_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: movl %ecx, %eax -; FALLBACK14-NEXT: shlb $5, %al +; FALLBACK14-NEXT: movzbl (%rsi), %eax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: shlb $5, %cl ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $6, %cl -; FALLBACK14-NEXT: movzbl %cl, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movl %ecx, %esi +; FALLBACK14-NEXT: andb $6, %al +; FALLBACK14-NEXT: movzbl %al, %eax +; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rax,4), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %rdi, %rcx +; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 24(%rdx) +; FALLBACK14-NEXT: movq %rcx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -6025,31 +6016,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movsbq %sil, %rdi +; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 +; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes: @@ -6167,38 +6158,38 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: leal (,%rcx,8), %eax +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: leal (,%rsi,8), %eax ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: negb %cl -; FALLBACK6-NEXT: movsbq %cl, %rcx -; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: andb $24, %sil +; FALLBACK6-NEXT: negb %sil +; FALLBACK6-NEXT: movsbq %sil, %rsi +; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK6-NEXT: shrq %rsi +; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK6-NEXT: orq %r9, %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: shrq %rcx -; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r8, %rcx -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; FALLBACK6-NEXT: movq %rsi, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes: @@ -6308,36 +6299,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-LABEL: shl_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: negb %cl -; FALLBACK10-NEXT: movsbq %cl, %rcx -; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andb $24, %sil +; FALLBACK10-NEXT: negb %sil +; FALLBACK10-NEXT: movsbq %sil, %rsi +; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK10-NEXT: shrq %rsi +; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK10-NEXT: orq %r9, %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: shrq %rcx -; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r8, %rcx -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, 24(%rdx) +; FALLBACK10-NEXT: movq %rsi, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -6446,36 +6437,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-LABEL: shl_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: leal (,%rsi,8), %eax ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: negb %cl -; FALLBACK14-NEXT: movsbq %cl, %rcx -; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: andb $24, %sil +; FALLBACK14-NEXT: negb %sil +; FALLBACK14-NEXT: movsbq %sil, %rsi +; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK14-NEXT: shrq %rsi +; FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK14-NEXT: orq %r9, %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: shrq %rcx -; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r8, %rcx -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: movq %rsi, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -6745,71 +6736,75 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, %eax +; FALLBACK18-NEXT: movl %eax, %ebp ; FALLBACK18-NEXT: andb $28, %bl ; FALLBACK18-NEXT: negb %bl ; FALLBACK18-NEXT: movsbl %bl, %esi ; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %eax, %edi -; FALLBACK18-NEXT: movl %edx, %ecx -; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: movl 68(%esp,%esi), %ecx +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %ecx, %edi +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: shrxl %edx, %ebx, %ebx ; FALLBACK18-NEXT: orl %edi, %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, %edi ; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax +; FALLBACK18-NEXT: shrxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp +; FALLBACK18-NEXT: movl %ebp, %esi +; FALLBACK18-NEXT: shlxl %ebp, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebx +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %edx, %ecx, %ecx +; FALLBACK18-NEXT: orl %ebx, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK18-NEXT: movl 80(%esp,%ebp), %ecx +; FALLBACK18-NEXT: movl %ecx, %ebx ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax -; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebp +; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %esi, %ecx, %ecx +; FALLBACK18-NEXT: movl %esi, %eax ; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp -; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: shrxl %edx, %edi, %edi +; FALLBACK18-NEXT: orl %ecx, %edi +; FALLBACK18-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: shlxl %esi, 92(%esp,%ecx), %ebp +; FALLBACK18-NEXT: movl 88(%esp,%ecx), %esi +; FALLBACK18-NEXT: shlxl %eax, %esi, %ecx ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK18-NEXT: shrxl %edx, %esi, %esi ; FALLBACK18-NEXT: orl %ebp, %esi ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) +; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: movl %ecx, (%edx) +; FALLBACK18-NEXT: movl %eax, 24(%edx) +; FALLBACK18-NEXT: movl %esi, 28(%edx) +; FALLBACK18-NEXT: movl %edi, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edx) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -7085,78 +7080,76 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: shlb $3, %cl ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: negb %cl -; FALLBACK22-NEXT: movsbl %cl, %edx -; FALLBACK22-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi -; FALLBACK22-NEXT: movl %eax, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, %esi -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %edi +; FALLBACK22-NEXT: movl %ecx, %ebx +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: negb %dl +; FALLBACK22-NEXT: movsbl %dl, %edx +; FALLBACK22-NEXT: movl 84(%esp,%edx), %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK22-NEXT: notb %cl +; FALLBACK22-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK22-NEXT: orl %ecx, %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %esi, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK22-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK22-NEXT: movl %esi, %edi ; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK22-NEXT: orl %ecx, %ebp -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %ebp, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK22-NEXT: movl %edi, %ebp +; FALLBACK22-NEXT: shrl %ebp +; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ebp +; FALLBACK22-NEXT: orl %esi, %ebp +; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK22-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK22-NEXT: movl %ebp, %edi +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %ecx -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK22-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: orl %ebp, %edi ; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK22-NEXT: orl %edi, %edx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK22-NEXT: movl %edi, (%esi) -; FALLBACK22-NEXT: movl %edx, 28(%esi) -; FALLBACK22-NEXT: movl %eax, 24(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %ebp, 8(%esi) +; FALLBACK22-NEXT: shrxl %ecx, %eax, %esi +; FALLBACK22-NEXT: movl 88(%esp,%edx), %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ecx, %eax, %eax +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movl %ebp, (%ecx) +; FALLBACK22-NEXT: movl %eax, 28(%ecx) +; FALLBACK22-NEXT: movl %esi, 24(%ecx) +; FALLBACK22-NEXT: movl %edi, 4(%ecx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 8(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 12(%esi) +; FALLBACK22-NEXT: movl %eax, 12(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 16(%esi) +; FALLBACK22-NEXT: movl %eax, 16(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%esi) +; FALLBACK22-NEXT: movl %eax, 20(%ecx) ; FALLBACK22-NEXT: addl $108, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -7410,76 +7403,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: negb %cl -; FALLBACK26-NEXT: movsbl %cl, %edx -; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi ; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, %esi -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %edi +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: negb %dl +; FALLBACK26-NEXT: movsbl %dl, %edx +; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %esi +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK26-NEXT: orl %ecx, %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %esi, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK26-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK26-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK26-NEXT: movl %esi, %edi ; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ecx, %ebp -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %ebp, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK26-NEXT: movl %edi, %ebp +; FALLBACK26-NEXT: shrl %ebp +; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK26-NEXT: orl %esi, %ebp +; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK26-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK26-NEXT: movl %ebp, %edi +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %ecx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK26-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %esi +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %eax, %esi, %edi +; FALLBACK26-NEXT: orl %ebp, %edi +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %eax, %ecx, %esi +; FALLBACK26-NEXT: movl 88(%esp,%edx), %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebp +; FALLBACK26-NEXT: orl %ebp, %esi +; FALLBACK26-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %eax, %ecx, %eax +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: movl %ebp, (%ecx) +; FALLBACK26-NEXT: movl %eax, 28(%ecx) +; FALLBACK26-NEXT: movl %esi, 24(%ecx) +; FALLBACK26-NEXT: movl %edi, 4(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK26-NEXT: orl %edi, %edx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK26-NEXT: movl %edi, (%esi) -; FALLBACK26-NEXT: movl %edx, 28(%esi) -; FALLBACK26-NEXT: movl %eax, 24(%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) -; FALLBACK26-NEXT: movl %ebp, 8(%esi) +; FALLBACK26-NEXT: movl %eax, 8(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 12(%esi) +; FALLBACK26-NEXT: movl %eax, 12(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 16(%esi) +; FALLBACK26-NEXT: movl %eax, 16(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%esi) +; FALLBACK26-NEXT: movl %eax, 20(%ecx) ; FALLBACK26-NEXT: addl $108, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -7732,76 +7723,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: negb %cl -; FALLBACK30-NEXT: movsbl %cl, %edx -; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi ; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, %esi -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %edi +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: negb %dl +; FALLBACK30-NEXT: movsbl %dl, %edx +; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %esi +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %esi, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK30-NEXT: movl %esi, %edi ; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ecx, %ebp -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi -; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %ecx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %edi, %ecx -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK30-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK30-NEXT: orl %edi, %edx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK30-NEXT: movl %edi, (%esi) -; FALLBACK30-NEXT: movl %edx, 28(%esi) -; FALLBACK30-NEXT: movl %eax, 24(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %ebp, 8(%esi) +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %ebp, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK30-NEXT: movl %edi, %ebp +; FALLBACK30-NEXT: shrl %ebp +; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK30-NEXT: orl %esi, %ebp +; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK30-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK30-NEXT: movl %ebp, %edi +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %eax, %esi, %edi +; FALLBACK30-NEXT: orl %ebp, %edi +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %eax, %ecx, %esi +; FALLBACK30-NEXT: movl 88(%esp,%edx), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %eax, %ecx, %eax +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: movl %ebp, (%ecx) +; FALLBACK30-NEXT: movl %eax, 28(%ecx) +; FALLBACK30-NEXT: movl %esi, 24(%ecx) +; FALLBACK30-NEXT: movl %edi, 4(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%esi) +; FALLBACK30-NEXT: movl %eax, 12(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 16(%esi) +; FALLBACK30-NEXT: movl %eax, 16(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%esi) +; FALLBACK30-NEXT: movl %eax, 20(%ecx) ; FALLBACK30-NEXT: addl $108, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -7987,32 +7976,32 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: shlb $2, %sil ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movsbq %sil, %rdi +; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 +; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes_dwordOff: @@ -8135,40 +8124,40 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: movl %ecx, %eax +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: movl %esi, %eax ; FALLBACK6-NEXT: shlb $5, %al ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: shlb $2, %cl -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: negb %cl -; FALLBACK6-NEXT: movsbq %cl, %rcx -; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: shlb $2, %sil +; FALLBACK6-NEXT: andb $24, %sil +; FALLBACK6-NEXT: negb %sil +; FALLBACK6-NEXT: movsbq %sil, %rsi +; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK6-NEXT: shrq %rsi +; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK6-NEXT: orq %r9, %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: shrq %rcx -; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r8, %rcx -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; FALLBACK6-NEXT: movq %rsi, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes_dwordOff: @@ -8283,38 +8272,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK10-LABEL: shl_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: movl %esi, %eax ; FALLBACK10-NEXT: shlb $5, %al ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: shlb $2, %cl -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: negb %cl -; FALLBACK10-NEXT: movsbq %cl, %rcx -; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: shlb $2, %sil +; FALLBACK10-NEXT: andb $24, %sil +; FALLBACK10-NEXT: negb %sil +; FALLBACK10-NEXT: movsbq %sil, %rsi +; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK10-NEXT: shrq %rsi +; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK10-NEXT: orq %r9, %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: shrq %rcx -; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r8, %rcx -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, 24(%rdx) +; FALLBACK10-NEXT: movq %rsi, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -8428,38 +8417,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK14-LABEL: shl_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: movl %ecx, %eax +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: movl %esi, %eax ; FALLBACK14-NEXT: shlb $5, %al ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: shlb $2, %cl -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: negb %cl -; FALLBACK14-NEXT: movsbq %cl, %rcx -; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: shlb $2, %sil +; FALLBACK14-NEXT: andb $24, %sil +; FALLBACK14-NEXT: negb %sil +; FALLBACK14-NEXT: movsbq %sil, %rsi +; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK14-NEXT: shrq %rsi +; FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK14-NEXT: orq %r9, %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: shrq %rcx -; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r8, %rcx -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: movq %rsi, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -8906,30 +8895,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes: @@ -9067,30 +9056,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %ecx ; FALLBACK6-NEXT: andb $24, %sil -; FALLBACK6-NEXT: movzbl %sil, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movzbl %sil, %esi +; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes: @@ -9227,30 +9216,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movl %eax, %ecx ; FALLBACK10-NEXT: andb $24, %sil -; FALLBACK10-NEXT: movzbl %sil, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movzbl %sil, %esi +; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %rdi, %rax +; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes: @@ -9387,30 +9376,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andb $24, %sil -; FALLBACK14-NEXT: movzbl %sil, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movzbl %sil, %esi +; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %rdi, %rax +; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: movq %rcx, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes: @@ -9671,7 +9660,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $108, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK18-NEXT: movl (%esi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -9680,22 +9669,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl 8(%esi), %ebx ; FALLBACK18-NEXT: movl 12(%esi), %ebp ; FALLBACK18-NEXT: movl 16(%esi), %edi -; FALLBACK18-NEXT: movzbl (%ecx), %ecx -; FALLBACK18-NEXT: movl 20(%esi), %edx +; FALLBACK18-NEXT: movzbl (%edx), %edx +; FALLBACK18-NEXT: movl 20(%esi), %ecx ; FALLBACK18-NEXT: movl 24(%esi), %eax ; FALLBACK18-NEXT: movl 28(%esi), %esi ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movl %edx, %ecx +; FALLBACK18-NEXT: shlb $3, %cl ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: sarl $31, %esi ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) @@ -9705,66 +9694,65 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $28, %cl -; FALLBACK18-NEXT: movzbl %cl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp -; FALLBACK18-NEXT: orl %ebx, %ebp -; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %esi -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%esi,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl %ecx, %eax +; FALLBACK18-NEXT: andb $28, %dl +; FALLBACK18-NEXT: movzbl %dl, %esi +; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx +; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%esi), %edi +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx +; FALLBACK18-NEXT: movl 44(%esp,%esi), %edx +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: sarxl %ebp, %esi, %esi +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK18-NEXT: movl %esi, 28(%edi) +; FALLBACK18-NEXT: movl %ecx, 24(%edi) +; FALLBACK18-NEXT: movl %eax, 16(%edi) +; FALLBACK18-NEXT: movl %edx, 20(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edi) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -10070,82 +10058,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movl 16(%ecx), %esi ; FALLBACK22-NEXT: movl 20(%ecx), %edi -; FALLBACK22-NEXT: movl 24(%ecx), %ebx -; FALLBACK22-NEXT: movl 28(%ecx), %edx -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl 24(%ecx), %ebp +; FALLBACK22-NEXT: movl 28(%ecx), %ecx +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: shlb $3, %bl +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: sarl $31, %edx -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %eax, %edx -; FALLBACK22-NEXT: notb %dl -; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK22-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: sarl $31, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ebx, %eax +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: movzbl %dl, %ecx +; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK22-NEXT: movl %eax, %ebp +; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %ebx, %ebx -; FALLBACK22-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK22-NEXT: orl %ebp, %ebx -; FALLBACK22-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK22-NEXT: sarxl %eax, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %edi -; FALLBACK22-NEXT: orl %ecx, %edi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl %ecx, 4(%edx) -; FALLBACK22-NEXT: movl %edi, 24(%edx) -; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edx +; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK22-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK22-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK22-NEXT: movl %ebp, %edx +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebp +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK22-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK22-NEXT: shrxl %edx, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %eax, %esi ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: sarxl %ebx, %ecx, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, 28(%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %esi, 24(%edx) +; FALLBACK22-NEXT: movl %edi, 16(%edx) +; FALLBACK22-NEXT: movl %ebp, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -10446,82 +10434,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK26-NEXT: movl 16(%ecx), %esi ; FALLBACK26-NEXT: movl 20(%ecx), %edi -; FALLBACK26-NEXT: movl 24(%ecx), %ebx -; FALLBACK26-NEXT: movl 28(%ecx), %edx -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shlb $3, %al -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl 24(%ecx), %ebp +; FALLBACK26-NEXT: movl 28(%ecx), %ecx +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: shlb $3, %bl +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: sarl $31, %edx -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %eax, %edx -; FALLBACK26-NEXT: notb %dl -; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebx -; FALLBACK26-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK26-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: sarl $31, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ebx, %eax +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: movzbl %dl, %ecx +; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK26-NEXT: movl %eax, %ebp +; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %ebx, %ebx -; FALLBACK26-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK26-NEXT: orl %ebp, %ebx -; FALLBACK26-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK26-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK26-NEXT: sarxl %eax, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %edi -; FALLBACK26-NEXT: orl %ecx, %edi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx +; FALLBACK26-NEXT: leal (%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edx +; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK26-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK26-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK26-NEXT: movl %ebp, %edx +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebp +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK26-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK26-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %eax, %esi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: sarxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %eax, 28(%edx) -; FALLBACK26-NEXT: movl %ecx, 4(%edx) -; FALLBACK26-NEXT: movl %edi, 24(%edx) -; FALLBACK26-NEXT: movl %ebx, 16(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: movl %ecx, 28(%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %esi, 24(%edx) +; FALLBACK26-NEXT: movl %edi, 16(%edx) +; FALLBACK26-NEXT: movl %ebp, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -10822,82 +10810,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK30-NEXT: movl 16(%ecx), %esi ; FALLBACK30-NEXT: movl 20(%ecx), %edi -; FALLBACK30-NEXT: movl 24(%ecx), %ebx -; FALLBACK30-NEXT: movl 28(%ecx), %edx -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shlb $3, %al -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl 24(%ecx), %ebp +; FALLBACK30-NEXT: movl 28(%ecx), %ecx +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: shlb $3, %bl +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: sarl $31, %edx -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %eax, %edx -; FALLBACK30-NEXT: notb %dl -; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK30-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: sarl $31, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ebx, %eax +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: movzbl %dl, %ecx +; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK30-NEXT: movl %eax, %ebp +; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %ebx, %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK30-NEXT: orl %ebp, %ebx -; FALLBACK30-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK30-NEXT: sarxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl %ecx, 4(%edx) -; FALLBACK30-NEXT: movl %edi, 24(%edx) -; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edx +; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK30-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK30-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK30-NEXT: movl %ebp, %edx +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebp +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK30-NEXT: shrxl %edx, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK30-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %eax, %esi ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: sarxl %ebx, %ecx, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, 28(%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %esi, 24(%edx) +; FALLBACK30-NEXT: movl %edi, 16(%edx) +; FALLBACK30-NEXT: movl %ebp, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -11104,30 +11092,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes_dwordOff: @@ -11268,30 +11256,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %ecx ; FALLBACK6-NEXT: andb $6, %sil -; FALLBACK6-NEXT: movzbl %sil, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movzbl %sil, %esi +; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes_dwordOff: @@ -11431,30 +11419,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movl %eax, %ecx ; FALLBACK10-NEXT: andb $6, %sil -; FALLBACK10-NEXT: movzbl %sil, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movzbl %sil, %esi +; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %rdi, %rax +; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes_dwordOff: @@ -11594,30 +11582,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andb $6, %sil -; FALLBACK14-NEXT: movzbl %sil, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movzbl %sil, %esi +; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %rdi, %rax +; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: movq %rcx, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes_dwordOff: @@ -12204,10 +12192,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: lshr_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -12235,60 +12221,58 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx ; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: movl %ecx, %esi ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8 +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r8, %r9 +; FALLBACK2-NEXT: notb %cl +; FALLBACK2-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK2-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK2-NEXT: leaq (%r14,%r14), %r9 +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r9 ; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK2-NEXT: addq %r11, %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK2-NEXT: orq %r10, %r11 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r10, %rbx +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r15 +; FALLBACK2-NEXT: leaq (%r15,%r15), %r12 +; FALLBACK2-NEXT: shlxq %rcx, %r12, %r12 +; FALLBACK2-NEXT: orq %rbx, %r12 +; FALLBACK2-NEXT: shrxq %rsi, %r14, %rbx ; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 -; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax -; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK2-NEXT: orq %rbx, %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r15, %rbx +; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK2-NEXT: leaq (%rax,%rax), %r14 +; FALLBACK2-NEXT: shlxq %rcx, %r14, %rcx +; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK2-NEXT: movq %rax, 56(%rdx) +; FALLBACK2-NEXT: movq %rcx, 48(%rdx) +; FALLBACK2-NEXT: movq %r10, 32(%rdx) +; FALLBACK2-NEXT: movq %r12, 40(%rdx) +; FALLBACK2-NEXT: movq %r11, 16(%rdx) +; FALLBACK2-NEXT: movq %r9, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_64bytes: @@ -12512,13 +12496,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: lshr_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 @@ -12533,62 +12515,60 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: leal (,%rax,8), %ecx +; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: movl %ecx, %esi ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK6-NEXT: notb %cl +; FALLBACK6-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK6-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK6-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK6-NEXT: orq %r8, %rdi +; FALLBACK6-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK6-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK6-NEXT: orq %rbx, %r8 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK6-NEXT: addq %r11, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK6-NEXT: orq %rbx, %r11 +; FALLBACK6-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK6-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK6-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK6-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK6-NEXT: orq %r15, %r13 +; FALLBACK6-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK6-NEXT: addq %rbx, %rbx +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK6-NEXT: orq %r14, %rbx +; FALLBACK6-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r9, %rdi -; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK6-NEXT: orq %r14, %r9 -; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK6-NEXT: orq %r15, %r10 -; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK6-NEXT: orq %r13, %rax -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK6-NEXT: orq %rbp, %rcx -; FALLBACK6-NEXT: movq %rsi, 56(%rdx) +; FALLBACK6-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK6-NEXT: orq %r14, %r15 +; FALLBACK6-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK6-NEXT: orq %r10, %rcx +; FALLBACK6-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK6-NEXT: movq %rax, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %r10, 32(%rdx) -; FALLBACK6-NEXT: movq %r9, 40(%rdx) -; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r8, (%rdx) -; FALLBACK6-NEXT: addq $8, %rsp +; FALLBACK6-NEXT: movq %r15, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 32(%rdx) +; FALLBACK6-NEXT: movq %r13, 40(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) +; FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_64bytes: @@ -12749,43 +12729,43 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: pushq %rbx ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK9-NEXT: movl (%rsi), %eax +; FALLBACK9-NEXT: movl (%rsi), %edi ; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx +; FALLBACK9-NEXT: leal (,%rdi,8), %ecx ; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK9-NEXT: andl $56, %edi +; FALLBACK9-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK9-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq %r9, %rax +; FALLBACK9-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK9-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK9-NEXT: movq %r10, %r8 ; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK9-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: movq %r11, %rbx ; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK9-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK9-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 +; FALLBACK9-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK9-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK9-NEXT: movq %rdi, %r15 ; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK9-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shrq %cl, %r11 ; FALLBACK9-NEXT: movq %r15, 8(%rdx) ; FALLBACK9-NEXT: movq %r9, 48(%rdx) ; FALLBACK9-NEXT: movq %r11, 56(%rdx) -; FALLBACK9-NEXT: movq %rdi, 32(%rdx) +; FALLBACK9-NEXT: movq %rsi, 32(%rdx) ; FALLBACK9-NEXT: movq %rbx, 40(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %rax, 24(%rdx) ; FALLBACK9-NEXT: movq %r14, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 @@ -12795,77 +12775,73 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: lshr_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: movl (%rsi), %esi ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl -; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: orq %r12, %r11 -; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp -; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK10-NEXT: orq %r9, %rdi -; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK10-NEXT: orq %r14, %r9 -; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK10-NEXT: orq %r15, %r10 -; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK10-NEXT: orq %r13, %rax -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK10-NEXT: orq %rbp, %rcx -; FALLBACK10-NEXT: movq %rsi, 56(%rdx) -; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %r10, 32(%rdx) -; FALLBACK10-NEXT: movq %r9, 40(%rdx) -; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r8, (%rdx) -; FALLBACK10-NEXT: addq $8, %rsp +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -120(%rsp,%rsi), %r10 +; FALLBACK10-NEXT: movq -112(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK10-NEXT: orq %r8, %rdi +; FALLBACK10-NEXT: movq -104(%rsp,%rsi), %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r11, %rbx +; FALLBACK10-NEXT: movq -96(%rsp,%rsi), %r14 +; FALLBACK10-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK10-NEXT: shlxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rbx, %r8 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rbx +; FALLBACK10-NEXT: addq %r11, %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rbx, %r11 +; FALLBACK10-NEXT: movq -88(%rsp,%rsi), %rbx +; FALLBACK10-NEXT: shrxq %rcx, %rbx, %r15 +; FALLBACK10-NEXT: movq -80(%rsp,%rsi), %r12 +; FALLBACK10-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK10-NEXT: shlxq %rax, %r13, %r13 +; FALLBACK10-NEXT: orq %r15, %r13 +; FALLBACK10-NEXT: shrxq %rcx, %r14, %r14 +; FALLBACK10-NEXT: addq %rbx, %rbx +; FALLBACK10-NEXT: shlxq %rax, %rbx, %rbx +; FALLBACK10-NEXT: orq %r14, %rbx +; FALLBACK10-NEXT: shrxq %rcx, %r12, %r14 +; FALLBACK10-NEXT: movq -72(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r15 +; FALLBACK10-NEXT: shlxq %rax, %r15, %r15 +; FALLBACK10-NEXT: orq %r14, %r15 +; FALLBACK10-NEXT: shrxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %r10, %rax +; FALLBACK10-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 56(%rdx) +; FALLBACK10-NEXT: movq %rax, 8(%rdx) +; FALLBACK10-NEXT: movq %r15, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 32(%rdx) +; FALLBACK10-NEXT: movq %r13, 40(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -12930,45 +12906,45 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: pushq %rax ; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK12-NEXT: movl (%rsi), %r9d +; FALLBACK12-NEXT: movl (%rsi), %r10d ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%r9,8), %eax +; FALLBACK12-NEXT: leal (,%r10,8), %eax ; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %r9d -; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 +; FALLBACK12-NEXT: andl $56, %r10d +; FALLBACK12-NEXT: movq -128(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq -120(%rsp,%r10), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %rbx +; FALLBACK12-NEXT: orq %r9, %rdi +; FALLBACK12-NEXT: movq -104(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq %r9, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12 +; FALLBACK12-NEXT: movq -96(%rsp,%r10), %r12 ; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r11 ; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx +; FALLBACK12-NEXT: movq -112(%rsp,%r10), %rbx ; FALLBACK12-NEXT: movq %rbx, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r14 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: addq %r9, %r9 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r14, %r10 -; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14 +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: orq %r14, %r9 +; FALLBACK12-NEXT: movq -88(%rsp,%r10), %r14 ; FALLBACK12-NEXT: movq %r14, %r13 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r13 -; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp +; FALLBACK12-NEXT: movq -80(%rsp,%r10), %rbp ; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r15 @@ -12981,8 +12957,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: orq %r12, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbp -; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %r12 +; FALLBACK12-NEXT: movq -72(%rsp,%r10), %r10 +; FALLBACK12-NEXT: leaq (%r10,%r10), %r12 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: orq %rbp, %r12 @@ -12993,13 +12969,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r8, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 56(%rdx) +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movq %r10, 56(%rdx) ; FALLBACK12-NEXT: movq %rbx, 8(%rdx) ; FALLBACK12-NEXT: movq %r12, 48(%rdx) ; FALLBACK12-NEXT: movq %r14, 32(%rdx) ; FALLBACK12-NEXT: movq %r15, 40(%rdx) -; FALLBACK12-NEXT: movq %r10, 16(%rdx) +; FALLBACK12-NEXT: movq %r9, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: addq $8, %rsp @@ -13062,74 +13038,70 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: lshr_64bytes: ; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK14-NEXT: movl (%rsi), %esi ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rsi,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: leal (,%rsi,8), %eax +; FALLBACK14-NEXT: andl $56, %eax +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andl $56, %esi -; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax -; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12 -; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9 -; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10 -; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14 -; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15 -; FALLBACK14-NEXT: movl %ecx, %ebx -; FALLBACK14-NEXT: notb %bl -; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %r10 +; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK14-NEXT: orq %r8, %rdi +; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r11, %rbx +; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r14 +; FALLBACK14-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK14-NEXT: shlxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rbx, %r8 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rbx +; FALLBACK14-NEXT: addq %r11, %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rbx, %r11 +; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %rbx +; FALLBACK14-NEXT: shrxq %rcx, %rbx, %r15 ; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12 -; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp +; FALLBACK14-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK14-NEXT: shlxq %rax, %r13, %r13 +; FALLBACK14-NEXT: orq %r15, %r13 +; FALLBACK14-NEXT: shrxq %rcx, %r14, %r14 +; FALLBACK14-NEXT: addq %rbx, %rbx +; FALLBACK14-NEXT: shlxq %rax, %rbx, %rbx +; FALLBACK14-NEXT: orq %r14, %rbx +; FALLBACK14-NEXT: shrxq %rcx, %r12, %r14 ; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r15 +; FALLBACK14-NEXT: shlxq %rax, %r15, %r15 +; FALLBACK14-NEXT: orq %r14, %r15 +; FALLBACK14-NEXT: shrxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %r10, %rax ; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK14-NEXT: orq %r9, %rdi -; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK14-NEXT: orq %r14, %r9 -; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK14-NEXT: orq %r15, %r10 -; FALLBACK14-NEXT: addq %rsi, %rsi -; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi -; FALLBACK14-NEXT: orq %r13, %rsi -; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK14-NEXT: orq %rbp, %rax ; FALLBACK14-NEXT: movq %rcx, 56(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rsi, 48(%rdx) -; FALLBACK14-NEXT: movq %r10, 32(%rdx) -; FALLBACK14-NEXT: movq %r9, 40(%rdx) -; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r8, (%rdx) -; FALLBACK14-NEXT: addq $8, %rsp +; FALLBACK14-NEXT: movq %r15, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 32(%rdx) +; FALLBACK14-NEXT: movq %r13, 40(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -13139,40 +13111,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: pushq %r14 ; FALLBACK15-NEXT: pushq %rbx ; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK15-NEXT: movl (%rsi), %eax +; FALLBACK15-NEXT: movl (%rsi), %edi ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx +; FALLBACK15-NEXT: leal (,%rdi,8), %ecx ; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK15-NEXT: andl $56, %edi +; FALLBACK15-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK15-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq %r9, %rax +; FALLBACK15-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK15-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK15-NEXT: movq %r10, %r8 ; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK15-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: movq %r11, %rbx ; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK15-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK15-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 +; FALLBACK15-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK15-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK15-NEXT: movq %rdi, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) -; FALLBACK15-NEXT: movq %rdi, 32(%rdx) +; FALLBACK15-NEXT: movq %rsi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rsi, 24(%rdx) +; FALLBACK15-NEXT: movq %rax, 24(%rdx) ; FALLBACK15-NEXT: movq %r14, (%rdx) ; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx @@ -13618,14 +13590,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 36(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ebp -; FALLBACK18-NEXT: movl 44(%eax), %ebx +; FALLBACK18-NEXT: movl 40(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 44(%eax), %ebp ; FALLBACK18-NEXT: movl 48(%eax), %edi ; FALLBACK18-NEXT: movl 52(%eax), %esi ; FALLBACK18-NEXT: movl 56(%eax), %edx ; FALLBACK18-NEXT: movl 60(%eax), %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %eax +; FALLBACK18-NEXT: movl (%eax), %ebx ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) @@ -13634,136 +13607,138 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, %ecx -; FALLBACK18-NEXT: leal (,%eax,8), %edx +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: leal (,%ebx,8), %edx ; FALLBACK18-NEXT: andl $24, %edx -; FALLBACK18-NEXT: andl $60, %ecx -; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %edx, %ecx +; FALLBACK18-NEXT: andl $60, %ebx +; FALLBACK18-NEXT: movl 68(%esp,%ebx), %esi +; FALLBACK18-NEXT: movl 72(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %edi -; FALLBACK18-NEXT: movl %edx, %ebx -; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: leal (%eax,%eax), %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 76(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 88(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 92(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 104(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 100(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 112(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl %ecx, %edi -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ecx, %ebp ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %ecx, %esi -; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %edx, %eax, %edi -; FALLBACK18-NEXT: orl %edi, %ecx -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %esi, %ecx +; FALLBACK18-NEXT: orl %eax, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK18-NEXT: movl 116(%esp,%ebx), %eax +; FALLBACK18-NEXT: movl %ebp, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebp +; FALLBACK18-NEXT: orl %ebp, %esi +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %ecx, %ebp ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK18-NEXT: orl %eax, %ebx +; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: movl 124(%esp,%ebx), %eax +; FALLBACK18-NEXT: leal (%eax,%eax), %ebx +; FALLBACK18-NEXT: shlxl %edx, %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: shrxl %ebp, %eax, %edi ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %edx, 60(%eax) -; FALLBACK18-NEXT: movl %ebx, 56(%eax) -; FALLBACK18-NEXT: movl %edi, 48(%eax) -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl %esi, 40(%eax) +; FALLBACK18-NEXT: movl %edi, 60(%eax) +; FALLBACK18-NEXT: movl %edx, 56(%eax) +; FALLBACK18-NEXT: movl %ecx, 48(%eax) +; FALLBACK18-NEXT: movl %esi, 52(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -14284,7 +14259,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK22-NEXT: movl (%eax), %ecx +; FALLBACK22-NEXT: movl (%eax), %ebx ; FALLBACK22-NEXT: xorps %xmm4, %xmm4 ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) @@ -14294,112 +14269,114 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%ecx,8), %edx +; FALLBACK22-NEXT: leal (,%ebx,8), %edx ; FALLBACK22-NEXT: andl $24, %edx -; FALLBACK22-NEXT: andl $60, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: andl $60, %ebx +; FALLBACK22-NEXT: movl 68(%esp,%ebx), %esi +; FALLBACK22-NEXT: movl 72(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %edi -; FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: notb %dl ; FALLBACK22-NEXT: leal (%eax,%eax), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: shlxl %edx, %ebp, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 76(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 88(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 84(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK22-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 92(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 104(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 100(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx -; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi +; FALLBACK22-NEXT: movl 112(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ecx, %ebp ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax -; FALLBACK22-NEXT: shrxl %edx, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, %esi, %ecx +; FALLBACK22-NEXT: orl %eax, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK22-NEXT: leal (%edi,%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK22-NEXT: movl 116(%esp,%ebx), %eax +; FALLBACK22-NEXT: movl %ebp, %ecx +; FALLBACK22-NEXT: shrxl %ebp, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %ebp ; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK22-NEXT: shrxl %edx, %ebp, %edx -; FALLBACK22-NEXT: addl %ebp, %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: movl 124(%esp,%ebx), %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %ebx +; FALLBACK22-NEXT: shlxl %edx, %ebx, %edx +; FALLBACK22-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK22-NEXT: orl %edi, %edx +; FALLBACK22-NEXT: shrxl %ebp, %eax, %edi ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edx, 60(%eax) -; FALLBACK22-NEXT: movl %ebx, 56(%eax) -; FALLBACK22-NEXT: movl %edi, 48(%eax) -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl %edi, 60(%eax) +; FALLBACK22-NEXT: movl %edx, 56(%eax) +; FALLBACK22-NEXT: movl %ecx, 48(%eax) +; FALLBACK22-NEXT: movl %esi, 52(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -14873,109 +14850,107 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: leal (,%ecx,8), %edx ; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %edi -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: notb %dl ; FALLBACK26-NEXT: leal (%eax,%eax), %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK26-NEXT: shlxl %edx, %ebp, %ebp ; FALLBACK26-NEXT: orl %edi, %ebp ; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %esi ; FALLBACK26-NEXT: orl %edi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: orl %eax, %ebp +; FALLBACK26-NEXT: movl 120(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %edx, %eax, %esi ; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: shrxl %ebx, %eax, %edi ; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax +; FALLBACK26-NEXT: shlxl %edx, %eax, %eax +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx -; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx -; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edx +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %edi, %edx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %edi ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: movl %edx, 60(%ecx) -; FALLBACK26-NEXT: movl %ebx, 56(%ecx) -; FALLBACK26-NEXT: movl %edi, 48(%ecx) +; FALLBACK26-NEXT: movl %edi, 60(%ecx) +; FALLBACK26-NEXT: movl %edx, 56(%ecx) +; FALLBACK26-NEXT: movl %eax, 48(%ecx) ; FALLBACK26-NEXT: movl %esi, 52(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 40(%ecx) +; FALLBACK26-NEXT: movl %ebp, 40(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 44(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -15430,115 +15405,113 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK30-NEXT: movl (%eax), %edx +; FALLBACK30-NEXT: movl (%eax), %ecx ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%edx,8), %ecx -; FALLBACK30-NEXT: andl $24, %ecx -; FALLBACK30-NEXT: andl $60, %edx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi -; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax +; FALLBACK30-NEXT: leal (,%ecx,8), %edx +; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: andl $60, %ecx +; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK30-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi -; FALLBACK30-NEXT: movl %ecx, %ebx -; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: notb %dl ; FALLBACK30-NEXT: leal (%eax,%eax), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: shlxl %edx, %ebp, %ebp ; FALLBACK30-NEXT: orl %edi, %ebp ; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi +; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: shlxl %edx, %esi, %esi ; FALLBACK30-NEXT: orl %edi, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi -; FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi -; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax -; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi +; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: orl %eax, %ebp +; FALLBACK30-NEXT: movl 120(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %edx, %eax, %esi +; FALLBACK30-NEXT: movl 116(%esp,%ecx), %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %edi ; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax -; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx -; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp -; FALLBACK30-NEXT: leal (%edx,%edx), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx -; FALLBACK30-NEXT: orl %eax, %edx +; FALLBACK30-NEXT: shlxl %edx, %eax, %eax +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl 124(%esp,%ecx), %ecx +; FALLBACK30-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edx +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %edi, %edx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %edi ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: movl %ebp, 60(%ecx) +; FALLBACK30-NEXT: movl %edi, 60(%ecx) ; FALLBACK30-NEXT: movl %edx, 56(%ecx) -; FALLBACK30-NEXT: movl %edi, 48(%ecx) +; FALLBACK30-NEXT: movl %eax, 48(%ecx) ; FALLBACK30-NEXT: movl %esi, 52(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 40(%ecx) +; FALLBACK30-NEXT: movl %ebp, 40(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 44(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -16196,10 +16169,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: shl_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -16227,62 +16198,60 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rsi,8), %eax ; FALLBACK2-NEXT: andl $56, %eax +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andl $56, %esi ; FALLBACK2-NEXT: negl %esi ; FALLBACK2-NEXT: movslq %esi, %rsi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10 -; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9 -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14 -; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK2-NEXT: shlxq %rax, %r8, %r15 -; FALLBACK2-NEXT: shlxq %rax, %r10, %r12 -; FALLBACK2-NEXT: movl %eax, %r13d -; FALLBACK2-NEXT: notb %r13b -; FALLBACK2-NEXT: shrq %r10 -; FALLBACK2-NEXT: shrxq %r13, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp -; FALLBACK2-NEXT: shrq %r14 -; FALLBACK2-NEXT: shrxq %r13, %r14, %r14 -; FALLBACK2-NEXT: orq %r11, %r14 -; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx -; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r9 +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %r8 +; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r10 ; FALLBACK2-NEXT: shrq %r9 -; FALLBACK2-NEXT: shrxq %r13, %r9, %r9 -; FALLBACK2-NEXT: orq %r15, %r9 +; FALLBACK2-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK2-NEXT: orq %r8, %r9 +; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %rbx +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r14 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %rbx, %r8 ; FALLBACK2-NEXT: shrq %rdi -; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi -; FALLBACK2-NEXT: orq %rbp, %rdi +; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK2-NEXT: orq %r14, %rdi +; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rbx +; FALLBACK2-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r15 +; FALLBACK2-NEXT: shlxq %rcx, %r15, %r12 +; FALLBACK2-NEXT: shrq %r15 +; FALLBACK2-NEXT: shrxq %rax, %r15, %r15 +; FALLBACK2-NEXT: orq %r14, %r15 +; FALLBACK2-NEXT: shrq %r11 +; FALLBACK2-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK2-NEXT: orq %r12, %r11 +; FALLBACK2-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14 +; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %rcx ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: shrq %r8 -; FALLBACK2-NEXT: shrxq %r13, %r8, %r8 -; FALLBACK2-NEXT: orq %rax, %r8 -; FALLBACK2-NEXT: movq %r12, (%rdx) -; FALLBACK2-NEXT: movq %r8, 48(%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK2-NEXT: orq %r14, %rsi +; FALLBACK2-NEXT: shrq %rbx +; FALLBACK2-NEXT: shrxq %rax, %rbx, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) +; FALLBACK2-NEXT: movq %rax, 48(%rdx) ; FALLBACK2-NEXT: movq %rsi, 56(%rdx) -; FALLBACK2-NEXT: movq %rdi, 32(%rdx) -; FALLBACK2-NEXT: movq %r9, 40(%rdx) -; FALLBACK2-NEXT: movq %rcx, 16(%rdx) -; FALLBACK2-NEXT: movq %r14, 24(%rdx) -; FALLBACK2-NEXT: movq %r10, 8(%rdx) +; FALLBACK2-NEXT: movq %r11, 32(%rdx) +; FALLBACK2-NEXT: movq %r15, 40(%rdx) +; FALLBACK2-NEXT: movq %rdi, 16(%rdx) +; FALLBACK2-NEXT: movq %r8, 24(%rdx) +; FALLBACK2-NEXT: movq %r9, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_64bytes: @@ -16509,86 +16478,81 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: shl_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 -; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: subq $24, %rsp +; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK6-NEXT: movl (%rsi), %eax +; FALLBACK6-NEXT: movl (%rsi), %esi ; FALLBACK6-NEXT: xorps %xmm4, %xmm4 ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm3, (%rsp) +; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %ecx -; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: leal (,%rsi,8), %eax ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: negl %eax -; FALLBACK6-NEXT: movslq %eax, %rsi -; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK6-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK6-NEXT: movl %ecx, %r9d -; FALLBACK6-NEXT: notb %r9b +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: negl %esi +; FALLBACK6-NEXT: movslq %esi, %rsi +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %r9, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK6-NEXT: shrq %r9 +; FALLBACK6-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK6-NEXT: orq %r10, %r9 +; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK6-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK6-NEXT: shrq %r10 +; FALLBACK6-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %r11, %r10 +; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK6-NEXT: shrq %r11 +; FALLBACK6-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %r14, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK6-NEXT: shrq %rbx +; FALLBACK6-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK6-NEXT: orq %r15, %rbx +; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK6-NEXT: orq %r12, %rdi -; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK6-NEXT: shrq %r13 -; FALLBACK6-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK6-NEXT: orq %r15, %r12 -; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK6-NEXT: shrq %r11 -; FALLBACK6-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK6-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK6-NEXT: shrq %r14 -; FALLBACK6-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK6-NEXT: orq %r10, %r14 -; FALLBACK6-NEXT: shrq %rsi -; FALLBACK6-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK6-NEXT: orq %rbx, %rsi -; FALLBACK6-NEXT: shrq %rax -; FALLBACK6-NEXT: shrxq %r9, %rax, %rax -; FALLBACK6-NEXT: orq %r8, %rax -; FALLBACK6-NEXT: shrq %rbp -; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK6-NEXT: orq %r15, %r8 -; FALLBACK6-NEXT: movq %rcx, (%rdx) -; FALLBACK6-NEXT: movq %r8, 56(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %rsi, 8(%rdx) -; FALLBACK6-NEXT: movq %r14, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r12, 32(%rdx) -; FALLBACK6-NEXT: movq %rdi, 40(%rdx) -; FALLBACK6-NEXT: addq $24, %rsp +; FALLBACK6-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK6-NEXT: shrq %r15 +; FALLBACK6-NEXT: shrxq %rax, %r15, %rax +; FALLBACK6-NEXT: orq %rcx, %rax +; FALLBACK6-NEXT: movq %r14, (%rdx) +; FALLBACK6-NEXT: movq %rax, 56(%rdx) +; FALLBACK6-NEXT: movq %rdi, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 8(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, 24(%rdx) +; FALLBACK6-NEXT: movq %r9, 32(%rdx) +; FALLBACK6-NEXT: movq %r8, 40(%rdx) +; FALLBACK6-NEXT: addq $8, %rsp ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 -; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_64bytes: @@ -16798,80 +16762,75 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: shl_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 -; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: subq $24, %rsp +; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: movl (%rsi), %esi ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %ecx -; FALLBACK10-NEXT: andl $56, %ecx +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: negl %eax -; FALLBACK10-NEXT: movslq %eax, %rsi -; FALLBACK10-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK10-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK10-NEXT: movl %ecx, %r9d -; FALLBACK10-NEXT: notb %r9b +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: negl %esi +; FALLBACK10-NEXT: movslq %esi, %rsi +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %r9, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK10-NEXT: shrq %r9 +; FALLBACK10-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK10-NEXT: orq %r10, %r9 +; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK10-NEXT: shrq %r10 +; FALLBACK10-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %r11, %r10 +; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK10-NEXT: shrq %r11 +; FALLBACK10-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %r14, %r11 +; FALLBACK10-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK10-NEXT: shrq %rbx +; FALLBACK10-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK10-NEXT: orq %r15, %rbx +; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK10-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK10-NEXT: orq %r12, %rdi -; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK10-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK10-NEXT: shrq %r13 -; FALLBACK10-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK10-NEXT: orq %r15, %r12 -; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK10-NEXT: shrq %r11 -; FALLBACK10-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK10-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK10-NEXT: shrq %r14 -; FALLBACK10-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK10-NEXT: orq %r10, %r14 -; FALLBACK10-NEXT: shrq %rsi -; FALLBACK10-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK10-NEXT: orq %rbx, %rsi -; FALLBACK10-NEXT: shrq %rax -; FALLBACK10-NEXT: shrxq %r9, %rax, %rax -; FALLBACK10-NEXT: orq %r8, %rax -; FALLBACK10-NEXT: shrq %rbp -; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK10-NEXT: orq %r15, %r8 -; FALLBACK10-NEXT: movq %rcx, (%rdx) -; FALLBACK10-NEXT: movq %r8, 56(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %rsi, 8(%rdx) -; FALLBACK10-NEXT: movq %r14, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r12, 32(%rdx) -; FALLBACK10-NEXT: movq %rdi, 40(%rdx) -; FALLBACK10-NEXT: addq $24, %rsp +; FALLBACK10-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK10-NEXT: shrq %r15 +; FALLBACK10-NEXT: shrxq %rax, %r15, %rax +; FALLBACK10-NEXT: orq %rcx, %rax +; FALLBACK10-NEXT: movq %r14, (%rdx) +; FALLBACK10-NEXT: movq %rax, 56(%rdx) +; FALLBACK10-NEXT: movq %rdi, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, 24(%rdx) +; FALLBACK10-NEXT: movq %r9, 32(%rdx) +; FALLBACK10-NEXT: movq %r8, 40(%rdx) +; FALLBACK10-NEXT: addq $8, %rsp ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 -; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -17071,77 +17030,72 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: shl_64bytes: ; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 -; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: subq $24, %rsp +; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK14-NEXT: movl (%rsi), %eax +; FALLBACK14-NEXT: movl (%rsi), %esi ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rax,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: leal (,%rsi,8), %eax ; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: negl %eax -; FALLBACK14-NEXT: movslq %eax, %rsi -; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK14-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK14-NEXT: movl %ecx, %r9d -; FALLBACK14-NEXT: notb %r9b +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: andl $56, %esi +; FALLBACK14-NEXT: negl %esi +; FALLBACK14-NEXT: movslq %esi, %rsi +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %r9, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK14-NEXT: shrq %r9 +; FALLBACK14-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK14-NEXT: orq %r10, %r9 +; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK14-NEXT: shrq %r10 +; FALLBACK14-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %r11, %r10 +; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK14-NEXT: shrq %r11 +; FALLBACK14-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %r14, %r11 +; FALLBACK14-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK14-NEXT: shrq %rbx +; FALLBACK14-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK14-NEXT: orq %r15, %rbx +; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK14-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK14-NEXT: orq %r12, %rdi -; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK14-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK14-NEXT: shrq %r13 -; FALLBACK14-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK14-NEXT: orq %r15, %r12 -; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: shrq %r11 -; FALLBACK14-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK14-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK14-NEXT: shrq %r14 -; FALLBACK14-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK14-NEXT: orq %r10, %r14 -; FALLBACK14-NEXT: shrq %rsi -; FALLBACK14-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK14-NEXT: orq %rbx, %rsi -; FALLBACK14-NEXT: shrq %rax -; FALLBACK14-NEXT: shrxq %r9, %rax, %rax -; FALLBACK14-NEXT: orq %r8, %rax -; FALLBACK14-NEXT: shrq %rbp -; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK14-NEXT: orq %r15, %r8 -; FALLBACK14-NEXT: movq %rcx, (%rdx) -; FALLBACK14-NEXT: movq %r8, 56(%rdx) -; FALLBACK14-NEXT: movq %rax, 48(%rdx) -; FALLBACK14-NEXT: movq %rsi, 8(%rdx) -; FALLBACK14-NEXT: movq %r14, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r12, 32(%rdx) -; FALLBACK14-NEXT: movq %rdi, 40(%rdx) -; FALLBACK14-NEXT: addq $24, %rsp +; FALLBACK14-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK14-NEXT: shrq %r15 +; FALLBACK14-NEXT: shrxq %rax, %r15, %rax +; FALLBACK14-NEXT: orq %rcx, %rax +; FALLBACK14-NEXT: movq %r14, (%rdx) +; FALLBACK14-NEXT: movq %rax, 56(%rdx) +; FALLBACK14-NEXT: movq %rdi, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, 24(%rdx) +; FALLBACK14-NEXT: movq %r9, 32(%rdx) +; FALLBACK14-NEXT: movq %r8, 40(%rdx) +; FALLBACK14-NEXT: addq $8, %rsp ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 -; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -17681,144 +17635,149 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: leal (,%ebp,8), %edx -; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: leal (,%ebp,8), %ebx +; FALLBACK18-NEXT: andl $24, %ebx +; FALLBACK18-NEXT: movl %ebx, %eax ; FALLBACK18-NEXT: andl $60, %ebp ; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK18-NEXT: subl %ebp, %edi -; FALLBACK18-NEXT: movl (%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %edx, %ebx +; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: subl %ebp, %edx +; FALLBACK18-NEXT: movl (%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%edx), %ecx ; FALLBACK18-NEXT: notb %bl -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK18-NEXT: orl %ecx, %esi +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK18-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: orl %esi, %edi +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%edx), %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%edi), %esi -; FALLBACK18-NEXT: movl %esi, %ecx -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 12(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %esi, %esi -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: orl %esi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 16(%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: movl 20(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 12(%edx), %esi +; FALLBACK18-NEXT: movl %ebp, %edi +; FALLBACK18-NEXT: shlxl %ebp, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK18-NEXT: orl %eax, %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%edi), %ecx +; FALLBACK18-NEXT: movl 16(%edx), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 28(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: movl 20(%edx), %ecx +; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %eax, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%edi), %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: movl 36(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: movl 24(%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 28(%edx), %esi +; FALLBACK18-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%edi), %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 32(%edx), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 44(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: movl 36(%edx), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %edi, %eax ; FALLBACK18-NEXT: shrl %esi ; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %eax, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%edi), %esi +; FALLBACK18-NEXT: orl %ebp, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 40(%edx), %edi +; FALLBACK18-NEXT: movl %edi, %esi ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 52(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK18-NEXT: movl 44(%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK18-NEXT: orl %ebp, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %edi, %edi +; FALLBACK18-NEXT: movl %eax, %esi +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: shrl %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK18-NEXT: orl %eax, %ebp -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl 48(%edx), %ebp +; FALLBACK18-NEXT: movl %ebp, %edi +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 52(%edx), %ecx +; FALLBACK18-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK18-NEXT: movl %esi, %ebp ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: negl %eax -; FALLBACK18-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK18-NEXT: movl 56(%edi), %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %edx -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %edx, %esi ; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, (%eax) -; FALLBACK18-NEXT: movl %esi, 56(%eax) -; FALLBACK18-NEXT: movl %ecx, 60(%eax) -; FALLBACK18-NEXT: movl %ebp, 48(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 40(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 44(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 32(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 36(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 24(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 28(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK18-NEXT: orl %edi, %esi +; FALLBACK18-NEXT: movl 56(%edx), %edi +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK18-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK18-NEXT: negl %ebx +; FALLBACK18-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK18-NEXT: orl %ecx, %ebx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: movl %edi, (%edx) +; FALLBACK18-NEXT: movl %eax, 56(%edx) +; FALLBACK18-NEXT: movl %ebx, 60(%edx) +; FALLBACK18-NEXT: movl %esi, 48(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 52(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 40(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 44(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 32(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 36(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 24(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 28(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edx) ; FALLBACK18-NEXT: addl $204, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -18342,144 +18301,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%eax,8), %edx -; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: leal (,%eax,8), %ebx +; FALLBACK22-NEXT: andl $24, %ebx +; FALLBACK22-NEXT: movl %ebx, %ecx ; FALLBACK22-NEXT: andl $60, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK22-NEXT: subl %eax, %edi -; FALLBACK22-NEXT: movl (%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 4(%edi), %eax +; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: subl %eax, %edx +; FALLBACK22-NEXT: movl (%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 4(%edx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK22-NEXT: orl %ecx, %esi +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK22-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 8(%edx), %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 8(%edi), %esi -; FALLBACK22-NEXT: movl %esi, %ecx -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 12(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, %esi, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 16(%edi), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: movl 20(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: movl 12(%edx), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ecx, %edi +; FALLBACK22-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: orl %eax, %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 24(%edi), %ecx +; FALLBACK22-NEXT: movl 16(%edx), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 28(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK22-NEXT: movl 20(%edx), %ecx +; FALLBACK22-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %eax, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 32(%edi), %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: movl 36(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK22-NEXT: movl 24(%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: movl 28(%edx), %esi +; FALLBACK22-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %eax, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 40(%edi), %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 32(%edx), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 44(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK22-NEXT: movl 36(%edx), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, %eax ; FALLBACK22-NEXT: shrl %esi ; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %eax, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%edi), %esi +; FALLBACK22-NEXT: orl %ebp, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 40(%edx), %edi +; FALLBACK22-NEXT: movl %edi, %esi ; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK22-NEXT: movl 52(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK22-NEXT: movl 44(%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, %edi, %edi +; FALLBACK22-NEXT: movl %eax, %esi +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK22-NEXT: orl %eax, %ebp -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl 48(%edx), %ebp +; FALLBACK22-NEXT: movl %ebp, %edi +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 52(%edx), %ecx +; FALLBACK22-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK22-NEXT: movl %esi, %ebp ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: negl %eax -; FALLBACK22-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK22-NEXT: movl 56(%edi), %eax -; FALLBACK22-NEXT: shlxl %edx, %eax, %edx -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edx, %esi ; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %eax, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK22-NEXT: movl %edx, (%eax) -; FALLBACK22-NEXT: movl %esi, 56(%eax) -; FALLBACK22-NEXT: movl %ecx, 60(%eax) -; FALLBACK22-NEXT: movl %ebp, 48(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 40(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 44(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 32(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 36(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 24(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 28(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 16(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 20(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 8(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 12(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 4(%eax) +; FALLBACK22-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK22-NEXT: orl %edi, %esi +; FALLBACK22-NEXT: movl 56(%edx), %edi +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK22-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK22-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK22-NEXT: negl %ebx +; FALLBACK22-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK22-NEXT: orl %ecx, %ebx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %edi, (%edx) +; FALLBACK22-NEXT: movl %eax, 56(%edx) +; FALLBACK22-NEXT: movl %ebx, 60(%edx) +; FALLBACK22-NEXT: movl %esi, 48(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 52(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 40(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 44(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 32(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 36(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 24(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 28(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 16(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 8(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 12(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 4(%edx) ; FALLBACK22-NEXT: addl $204, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -18943,144 +18908,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: leal (,%eax,8), %edx -; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: leal (,%eax,8), %ebx +; FALLBACK26-NEXT: andl $24, %ebx +; FALLBACK26-NEXT: movl %ebx, %ecx ; FALLBACK26-NEXT: andl $60, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK26-NEXT: subl %eax, %edi -; FALLBACK26-NEXT: movl (%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 4(%edi), %eax +; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: subl %eax, %edx +; FALLBACK26-NEXT: movl (%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 4(%edx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK26-NEXT: orl %ecx, %esi +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 8(%edx), %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 8(%edi), %esi -; FALLBACK26-NEXT: movl %esi, %ecx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 12(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, %esi, %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 16(%edi), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: movl 20(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: movl 12(%edx), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %ecx, %edi +; FALLBACK26-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: orl %eax, %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 24(%edi), %ecx +; FALLBACK26-NEXT: movl 16(%edx), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 28(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK26-NEXT: movl 20(%edx), %ecx +; FALLBACK26-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 32(%edi), %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: movl 36(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: movl 24(%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: movl 28(%edx), %esi +; FALLBACK26-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 40(%edi), %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 32(%edx), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 44(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK26-NEXT: movl 36(%edx), %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, %eax ; FALLBACK26-NEXT: shrl %esi ; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%edi), %esi +; FALLBACK26-NEXT: orl %ebp, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 40(%edx), %edi +; FALLBACK26-NEXT: movl %edi, %esi ; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK26-NEXT: movl 52(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK26-NEXT: movl 44(%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %eax, %edi, %edi +; FALLBACK26-NEXT: movl %eax, %esi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: shrl %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK26-NEXT: orl %eax, %ebp -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl 48(%edx), %ebp +; FALLBACK26-NEXT: movl %ebp, %edi +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK26-NEXT: movl 52(%edx), %ecx +; FALLBACK26-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK26-NEXT: movl %esi, %ebp ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: negl %eax -; FALLBACK26-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK26-NEXT: movl 56(%edi), %eax -; FALLBACK26-NEXT: shlxl %edx, %eax, %edx -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %edx, %esi ; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK26-NEXT: movl %edx, (%eax) -; FALLBACK26-NEXT: movl %esi, 56(%eax) -; FALLBACK26-NEXT: movl %ecx, 60(%eax) -; FALLBACK26-NEXT: movl %ebp, 48(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 52(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 40(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 44(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 32(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 36(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 24(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 28(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 16(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 20(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 8(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 12(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 4(%eax) +; FALLBACK26-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: movl 56(%edx), %edi +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK26-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK26-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK26-NEXT: negl %ebx +; FALLBACK26-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK26-NEXT: orl %ecx, %ebx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %edi, (%edx) +; FALLBACK26-NEXT: movl %eax, 56(%edx) +; FALLBACK26-NEXT: movl %ebx, 60(%edx) +; FALLBACK26-NEXT: movl %esi, 48(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 52(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 40(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 44(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 32(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 36(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 24(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 28(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 16(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 8(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 12(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 4(%edx) ; FALLBACK26-NEXT: addl $204, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -19531,144 +19502,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%eax,8), %edx -; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: leal (,%eax,8), %ebx +; FALLBACK30-NEXT: andl $24, %ebx +; FALLBACK30-NEXT: movl %ebx, %ecx ; FALLBACK30-NEXT: andl $60, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK30-NEXT: subl %eax, %edi -; FALLBACK30-NEXT: movl (%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 4(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK30-NEXT: orl %ecx, %esi +; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: subl %eax, %edx +; FALLBACK30-NEXT: movl (%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 8(%edi), %esi -; FALLBACK30-NEXT: movl %esi, %ecx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 12(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 16(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: movl 20(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %eax, %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 24(%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 28(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl 4(%edx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 8(%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 32(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: movl 36(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK30-NEXT: movl 12(%edx), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %ecx, %edi +; FALLBACK30-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK30-NEXT: orl %eax, %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 40(%edi), %ecx +; FALLBACK30-NEXT: movl 16(%edx), %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 44(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK30-NEXT: movl 20(%edx), %ecx +; FALLBACK30-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%edi), %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 24(%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 52(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: movl 28(%edx), %esi +; FALLBACK30-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK30-NEXT: orl %eax, %ebp -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: negl %eax -; FALLBACK30-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK30-NEXT: movl 56(%edi), %eax -; FALLBACK30-NEXT: shlxl %edx, %eax, %edx +; FALLBACK30-NEXT: movl 32(%edx), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: movl 36(%edx), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edi, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, %eax ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edx, %esi +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 40(%edx), %edi +; FALLBACK30-NEXT: movl %edi, %esi +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK30-NEXT: movl 44(%edx), %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, %edi, %edi +; FALLBACK30-NEXT: movl %eax, %esi +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: shrl %eax ; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %eax, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK30-NEXT: movl %edx, (%eax) -; FALLBACK30-NEXT: movl %esi, 56(%eax) -; FALLBACK30-NEXT: movl %ecx, 60(%eax) -; FALLBACK30-NEXT: movl %ebp, 48(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 52(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 40(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 44(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 32(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 36(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 24(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 28(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 16(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 20(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 8(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 12(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 4(%eax) +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%edx), %ebp +; FALLBACK30-NEXT: movl %ebp, %edi +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 52(%edx), %ecx +; FALLBACK30-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK30-NEXT: movl %esi, %ebp +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: shrl %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: movl 56(%edx), %edi +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK30-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK30-NEXT: negl %ebx +; FALLBACK30-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK30-NEXT: orl %ecx, %ebx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %edi, (%edx) +; FALLBACK30-NEXT: movl %eax, 56(%edx) +; FALLBACK30-NEXT: movl %ebx, 60(%edx) +; FALLBACK30-NEXT: movl %esi, 48(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 52(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 40(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 44(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 32(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 36(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 24(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 28(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 16(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 12(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 4(%edx) ; FALLBACK30-NEXT: addl $204, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -20336,10 +20313,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: ashr_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -20371,60 +20346,58 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx ; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: movl %ecx, %esi ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8 +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r8, %r9 +; FALLBACK2-NEXT: notb %cl +; FALLBACK2-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK2-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK2-NEXT: leaq (%r14,%r14), %r9 +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r9 ; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK2-NEXT: addq %r11, %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK2-NEXT: orq %r10, %r11 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r10, %rbx +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r15 +; FALLBACK2-NEXT: leaq (%r15,%r15), %r12 +; FALLBACK2-NEXT: shlxq %rcx, %r12, %r12 +; FALLBACK2-NEXT: orq %rbx, %r12 +; FALLBACK2-NEXT: shrxq %rsi, %r14, %rbx ; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 -; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax -; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK2-NEXT: orq %rbx, %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r15, %rbx +; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK2-NEXT: leaq (%rax,%rax), %r14 +; FALLBACK2-NEXT: shlxq %rcx, %r14, %rcx +; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK2-NEXT: movq %rax, 56(%rdx) +; FALLBACK2-NEXT: movq %rcx, 48(%rdx) +; FALLBACK2-NEXT: movq %r10, 32(%rdx) +; FALLBACK2-NEXT: movq %r12, 40(%rdx) +; FALLBACK2-NEXT: movq %r11, 16(%rdx) +; FALLBACK2-NEXT: movq %r9, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_64bytes: @@ -20664,13 +20637,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: ashr_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 @@ -20691,62 +20662,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: leal (,%rax,8), %ecx +; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: movl %ecx, %esi ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK6-NEXT: notb %cl +; FALLBACK6-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK6-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK6-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK6-NEXT: orq %r8, %rdi +; FALLBACK6-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK6-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK6-NEXT: orq %rbx, %r8 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK6-NEXT: addq %r11, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK6-NEXT: orq %rbx, %r11 +; FALLBACK6-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK6-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK6-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK6-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK6-NEXT: orq %r15, %r13 +; FALLBACK6-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK6-NEXT: addq %rbx, %rbx +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK6-NEXT: orq %r14, %rbx +; FALLBACK6-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r9, %rdi -; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK6-NEXT: orq %r14, %r9 -; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK6-NEXT: orq %r15, %r10 -; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK6-NEXT: orq %r13, %rax -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK6-NEXT: orq %rbp, %rcx -; FALLBACK6-NEXT: movq %rsi, 56(%rdx) +; FALLBACK6-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK6-NEXT: orq %r14, %r15 +; FALLBACK6-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK6-NEXT: orq %r10, %rcx +; FALLBACK6-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK6-NEXT: movq %rax, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %r10, 32(%rdx) -; FALLBACK6-NEXT: movq %r9, 40(%rdx) -; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r8, (%rdx) -; FALLBACK6-NEXT: addq $8, %rsp +; FALLBACK6-NEXT: movq %r15, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 32(%rdx) +; FALLBACK6-NEXT: movq %r13, 40(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) +; FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_64bytes: @@ -20979,13 +20948,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: ashr_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK10-NEXT: movq 48(%rdi), %rcx @@ -21004,62 +20971,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: leal (,%rax,8), %ecx +; FALLBACK10-NEXT: andl $56, %ecx +; FALLBACK10-NEXT: movl %ecx, %esi ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl -; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: orq %r12, %r11 +; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK10-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK10-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK10-NEXT: orq %r8, %rdi +; FALLBACK10-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK10-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK10-NEXT: orq %rbx, %r8 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK10-NEXT: addq %r11, %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rbx, %r11 +; FALLBACK10-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK10-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK10-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK10-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK10-NEXT: orq %r15, %r13 +; FALLBACK10-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK10-NEXT: addq %rbx, %rbx +; FALLBACK10-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK10-NEXT: orq %r14, %rbx +; FALLBACK10-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK10-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK10-NEXT: orq %r9, %rdi -; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK10-NEXT: orq %r14, %r9 -; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK10-NEXT: orq %r15, %r10 -; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK10-NEXT: orq %r13, %rax -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK10-NEXT: orq %rbp, %rcx -; FALLBACK10-NEXT: movq %rsi, 56(%rdx) +; FALLBACK10-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK10-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK10-NEXT: orq %r14, %r15 +; FALLBACK10-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %r10, %rcx +; FALLBACK10-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 56(%rdx) ; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %r10, 32(%rdx) -; FALLBACK10-NEXT: movq %r9, 40(%rdx) -; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r8, (%rdx) -; FALLBACK10-NEXT: addq $8, %rsp +; FALLBACK10-NEXT: movq %r15, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 32(%rdx) +; FALLBACK10-NEXT: movq %r13, 40(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -21292,13 +21257,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: ashr_64bytes: ; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK14-NEXT: movq 48(%rdi), %rcx @@ -21317,62 +21280,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rax,8), %esi -; FALLBACK14-NEXT: andl $56, %esi +; FALLBACK14-NEXT: leal (,%rax,8), %ecx +; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: movl %ecx, %esi ; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK14-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK14-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK14-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK14-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK14-NEXT: movl %esi, %ebx -; FALLBACK14-NEXT: notb %bl -; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK14-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK14-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK14-NEXT: orq %r8, %rdi +; FALLBACK14-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK14-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK14-NEXT: orq %rbx, %r8 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK14-NEXT: addq %r11, %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rbx, %r11 +; FALLBACK14-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK14-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK14-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK14-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK14-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK14-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK14-NEXT: orq %r15, %r13 +; FALLBACK14-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK14-NEXT: addq %rbx, %rbx +; FALLBACK14-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK14-NEXT: orq %r14, %rbx +; FALLBACK14-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK14-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK14-NEXT: orq %r9, %rdi -; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK14-NEXT: orq %r14, %r9 -; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK14-NEXT: orq %r15, %r10 -; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK14-NEXT: orq %r13, %rax -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK14-NEXT: orq %rbp, %rcx -; FALLBACK14-NEXT: movq %rsi, 56(%rdx) +; FALLBACK14-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK14-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK14-NEXT: orq %r14, %r15 +; FALLBACK14-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %r10, %rcx +; FALLBACK14-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 56(%rdx) ; FALLBACK14-NEXT: movq %rcx, 8(%rdx) -; FALLBACK14-NEXT: movq %rax, 48(%rdx) -; FALLBACK14-NEXT: movq %r10, 32(%rdx) -; FALLBACK14-NEXT: movq %r9, 40(%rdx) -; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r8, (%rdx) -; FALLBACK14-NEXT: addq $8, %rsp +; FALLBACK14-NEXT: movq %r15, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 32(%rdx) +; FALLBACK14-NEXT: movq %r13, 40(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -21960,111 +21921,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: leal (,%eax,8), %edx ; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: movl %edx, %ebx ; FALLBACK18-NEXT: andl $60, %ecx ; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %edx, %ebx -; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: leal (%edi,%edi), %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %ecx, %ebp +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl %ecx, %edi -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %ecx, %esi -; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %edx, %eax, %edi -; FALLBACK18-NEXT: orl %edi, %ecx -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK18-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK18-NEXT: orl %ebp, %esi +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK18-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK18-NEXT: orl %eax, %ebx +; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK18-NEXT: orl %ebp, %ecx +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK18-NEXT: leal (%eax,%eax), %ebp +; FALLBACK18-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK18-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %edx, 60(%eax) -; FALLBACK18-NEXT: movl %ebx, 56(%eax) -; FALLBACK18-NEXT: movl %edi, 48(%eax) -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl %esi, 40(%eax) +; FALLBACK18-NEXT: movl %edi, 60(%eax) +; FALLBACK18-NEXT: movl %edx, 56(%eax) +; FALLBACK18-NEXT: movl %ecx, 48(%eax) +; FALLBACK18-NEXT: movl %esi, 52(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -22664,111 +22626,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl %eax, %ecx ; FALLBACK22-NEXT: leal (,%eax,8), %edx ; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: andl $60, %ecx ; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: notb %dl ; FALLBACK22-NEXT: leal (%edi,%edi), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK22-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax ; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %ebp +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax ; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl %ecx, %edi -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK22-NEXT: shrxl %edx, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK22-NEXT: leal (%edi,%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK22-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK22-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK22-NEXT: addl %ebp, %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %ebp +; FALLBACK22-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK22-NEXT: orl %edi, %edx +; FALLBACK22-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edx, 60(%eax) -; FALLBACK22-NEXT: movl %ebx, 56(%eax) -; FALLBACK22-NEXT: movl %edi, 48(%eax) -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl %edi, 60(%eax) +; FALLBACK22-NEXT: movl %edx, 56(%eax) +; FALLBACK22-NEXT: movl %ecx, 48(%eax) +; FALLBACK22-NEXT: movl %esi, 52(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -23326,111 +23289,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl %eax, %ecx ; FALLBACK26-NEXT: leal (,%eax,8), %edx ; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: notb %dl ; FALLBACK26-NEXT: leal (%edi,%edi), %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK26-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl %ecx, %ebp +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl %ecx, %edi -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK26-NEXT: leal (%edi,%edi), %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK26-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK26-NEXT: orl %ebp, %esi +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK26-NEXT: addl %ebp, %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK26-NEXT: leal (%eax,%eax), %ebp +; FALLBACK26-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK26-NEXT: orl %edi, %edx +; FALLBACK26-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl %edx, 60(%eax) -; FALLBACK26-NEXT: movl %ebx, 56(%eax) -; FALLBACK26-NEXT: movl %edi, 48(%eax) -; FALLBACK26-NEXT: movl %ecx, 52(%eax) -; FALLBACK26-NEXT: movl %esi, 40(%eax) +; FALLBACK26-NEXT: movl %edi, 60(%eax) +; FALLBACK26-NEXT: movl %edx, 56(%eax) +; FALLBACK26-NEXT: movl %ecx, 48(%eax) +; FALLBACK26-NEXT: movl %esi, 52(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 40(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 44(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -23988,111 +23952,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl %eax, %ecx ; FALLBACK30-NEXT: leal (,%eax,8), %edx ; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: movl %edx, %ebx ; FALLBACK30-NEXT: andl $60, %ecx ; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: notb %dl ; FALLBACK30-NEXT: leal (%edi,%edi), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK30-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax ; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl %ecx, %ebp +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax ; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl %ecx, %edi -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK30-NEXT: shrxl %edx, %eax, %edi -; FALLBACK30-NEXT: orl %edi, %ecx -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK30-NEXT: leal (%edi,%edi), %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK30-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK30-NEXT: addl %ebp, %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK30-NEXT: orl %eax, %ebx +; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK30-NEXT: leal (%eax,%eax), %ebp +; FALLBACK30-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK30-NEXT: orl %edi, %edx +; FALLBACK30-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl %edx, 60(%eax) -; FALLBACK30-NEXT: movl %ebx, 56(%eax) -; FALLBACK30-NEXT: movl %edi, 48(%eax) -; FALLBACK30-NEXT: movl %ecx, 52(%eax) -; FALLBACK30-NEXT: movl %esi, 40(%eax) +; FALLBACK30-NEXT: movl %edi, 60(%eax) +; FALLBACK30-NEXT: movl %edx, 56(%eax) +; FALLBACK30-NEXT: movl %ecx, 48(%eax) +; FALLBACK30-NEXT: movl %esi, 52(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 40(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 44(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 338e104fbe8f0..221a51ed44696 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -712,33 +712,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -994,42 +994,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1297,33 +1297,33 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1487,31 +1487,31 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: @@ -1761,88 +1761,90 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -2040,32 +2042,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; @@ -2319,97 +2321,101 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, 92(%esp,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -2610,31 +2616,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: @@ -2927,60 +2933,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 32(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 24(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3263,13 +3268,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -3292,65 +3295,63 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes: @@ -3868,20 +3869,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -3906,116 +3907,117 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4388,10 +4390,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax @@ -4419,63 +4419,61 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r15, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r15, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rbx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes: @@ -4972,33 +4970,33 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 @@ -5011,7 +5009,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -5032,149 +5030,152 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 60(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 40(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 44(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 36(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -5534,13 +5535,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -5567,65 +5566,63 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes: @@ -6221,33 +6218,31 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi @@ -6256,87 +6251,84 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index c3054a365c466..6b5c6049f025b 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -1635,22 +1635,22 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: @@ -1807,40 +1807,43 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1906,13 +1909,13 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movb %cl, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2070,13 +2073,13 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movw %cx, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2233,13 +2236,13 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movl %ecx, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2521,10 +2524,11 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $140, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -2541,25 +2545,26 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp,%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $140, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> @@ -2667,21 +2672,21 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r10, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq @@ -2860,33 +2865,33 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 16(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 8(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp @@ -3026,9 +3031,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 @@ -3043,38 +3046,36 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: @@ -3304,7 +3305,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $172, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 @@ -3320,59 +3321,60 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 32(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) @@ -3380,7 +3382,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $172, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 7735500bd3a88..bed8e5806380c 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -1879,22 +1879,22 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: @@ -2055,40 +2055,43 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll index 4d261a9810896..9fbbba2ed3b47 100644 --- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -820,7 +820,7 @@ define void @infiniteloop() { ; ENABLE-NEXT: movq %rsp, %rax ; ENABLE-NEXT: addq $-16, %rax ; ENABLE-NEXT: movq %rax, %rsp -; ENABLE-NEXT: xorl %ecx, %ecx +; ENABLE-NEXT: xorl %ecx, %ecx ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB10_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -851,8 +851,8 @@ define void @infiniteloop() { ; DISABLE-NEXT: ## %bb.1: ## %if.then ; DISABLE-NEXT: movq %rsp, %rax ; DISABLE-NEXT: addq $-16, %rax -; DISABLE-NEXT: %rax, %rsp -; DISABLE-NEXT: xorl %ecx, %ecx +; DISABLE-NEXT: movq %rax, %rsp +; DISABLE-NEXT: xorl %ecx, %ecx ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB10_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -1185,10 +1185,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 { ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB14_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; ENABLE-NEXT: cmpl %esi, %edi -; ENABLE-NEXT: setl %al +; ENABLE-NEXT: movl %esi, %eax ; ENABLE-NEXT: xorl %esi, %esi -; ENABLE-NEXT: movb %al, %sil +; ENABLE-NEXT: cmpl %eax, %edi +; ENABLE-NEXT: setl %sil ; ENABLE-NEXT: incb %dl ; ENABLE-NEXT: cmpb $45, %dl ; ENABLE-NEXT: jl LBB14_2 @@ -1220,10 +1220,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 { ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB14_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; DISABLE-NEXT: cmpl %esi, %edi -; DISABLE-NEXT: setl %al +; DISABLE-NEXT: movl %esi, %eax ; DISABLE-NEXT: xorl %esi, %esi -; DISABLE-NEXT: movb %al, %sil +; DISABLE-NEXT: cmpl %eax, %edi +; DISABLE-NEXT: setl %sil ; DISABLE-NEXT: incb %dl ; DISABLE-NEXT: cmpb $45, %dl ; DISABLE-NEXT: jl LBB14_2 diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index 2bef66825d8c0..59fbf7183abc6 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -62,12 +62,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB3_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB3_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -78,12 +78,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB3_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB3_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -94,12 +94,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB3_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB3_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -126,13 +126,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB4_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: testw %dx, %dx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: testw %cx, %cx ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -144,13 +144,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB4_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: testw %cx, %cx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %esi, %ecx +; X64-LIN-NEXT: xorl %ecx, %eax +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi +; X64-LIN-NEXT: testw %si, %si ; X64-LIN-NEXT: jne .LBB4_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -163,13 +163,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB4_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: testw %cx, %cx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %edx, %ecx +; X64-WIN-NEXT: xorl %ecx, %eax +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx +; X64-WIN-NEXT: testw %dx, %dx ; X64-WIN-NEXT: jne .LBB4_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -197,12 +197,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB5_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorb %cl, %al -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notb %dl -; X86-NEXT: andb %cl, %dl -; X86-NEXT: addb %dl, %dl -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notb %cl +; X86-NEXT: andb %dl, %cl +; X86-NEXT: addb %cl, %cl ; X86-NEXT: jne .LBB5_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -213,12 +213,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB5_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorb %sil, %al -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notb %cl -; X64-LIN-NEXT: andb %sil, %cl -; X64-LIN-NEXT: addb %cl, %cl -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notb %sil +; X64-LIN-NEXT: andb %cl, %sil +; X64-LIN-NEXT: addb %sil, %sil ; X64-LIN-NEXT: jne .LBB5_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $al killed $al killed $eax @@ -230,12 +230,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB5_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorb %dl, %al -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notb %cl -; X64-WIN-NEXT: andb %dl, %cl -; X64-WIN-NEXT: addb %cl, %cl -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notb %dl +; X64-WIN-NEXT: andb %cl, %dl +; X64-WIN-NEXT: addb %dl, %dl ; X64-WIN-NEXT: jne .LBB5_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -262,12 +262,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB6_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB6_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -278,12 +278,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB6_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: xorl $2147483646, %esi # imm = 0x7FFFFFFE +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB6_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -294,12 +294,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB6_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB6_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s index c393d3e819880..3f6d8feb45df0 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s @@ -34,3 +34,83 @@ v_cvt_f32_bf16 v5, v1 div:2 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. // GFX1250-ERR-NEXT:{{^}}v_cvt_f32_bf16 v5, v1 div:2 // GFX1250-ERR-NEXT:{{^}} ^ + +v_cos_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_cos_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_exp_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_exp_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_log_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_log_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rcp_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rcp_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rsq_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rsq_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sin_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sin_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sqrt_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sqrt_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_tanh_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_tanh_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 0931523bbf40c..37ad6eb249da4 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -3781,15 +3781,6 @@ v_tanh_bf16_e64 v5, null v_tanh_bf16_e64 v5, -1 // GFX1250: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - -v_tanh_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - -v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_prng_b32_e64 v5, v1 // GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00] @@ -3862,15 +3853,6 @@ v_rcp_bf16_e64 v5, null v_rcp_bf16_e64 v5, -1 // GFX1250: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - -v_rcp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - -v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sqrt_bf16_e64 v5, v1 // GFX1250: v_sqrt_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00] @@ -3907,15 +3889,6 @@ v_sqrt_bf16_e64 v5, null v_sqrt_bf16_e64 v5, -1 // GFX1250: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - -v_sqrt_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - -v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rsq_bf16_e64 v5, v1 // GFX1250: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] @@ -3952,15 +3925,6 @@ v_rsq_bf16_e64 v5, null v_rsq_bf16_e64 v5, -1 // GFX1250: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - -v_rsq_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - -v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_log_bf16_e64 v5, v1 // GFX1250: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] @@ -3997,15 +3961,6 @@ v_log_bf16_e64 v5, null v_log_bf16_e64 v5, -1 // GFX1250: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -v_log_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - -v_log_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - -v_log_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_exp_bf16_e64 v5, v1 // GFX1250: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] @@ -4042,15 +3997,6 @@ v_exp_bf16_e64 v5, null v_exp_bf16_e64 v5, -1 // GFX1250: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -v_exp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - -v_exp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - -v_exp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sin_bf16_e64 v5, v1 // GFX1250: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] @@ -4087,15 +4033,6 @@ v_sin_bf16_e64 v5, null v_sin_bf16_e64 v5, -1 // GFX1250: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -v_sin_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] - -v_sin_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] - -v_sin_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cos_bf16_e64 v5, v1 // GFX1250: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] @@ -4132,15 +4069,6 @@ v_cos_bf16_e64 v5, null v_cos_bf16_e64 v5, -1 // GFX1250: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -v_cos_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] - -v_cos_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] - -v_cos_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index 5ac9eb47381d6..52f9ba3a99483 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -3952,15 +3952,6 @@ v_tanh_bf16_e64 v5.l, null v_tanh_bf16_e64 v5.l, -1 // GFX1250: v_tanh_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - -v_tanh_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - -v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_tanh_bf16 v5.l, v128.h // GFX1250: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00] @@ -4036,15 +4027,6 @@ v_rcp_bf16_e64 v5.l, null v_rcp_bf16_e64 v5.l, -1 // GFX1250: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - -v_rcp_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - -v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rcp_bf16 v5.h, v128.h // GFX1250: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00] @@ -4084,15 +4066,6 @@ v_sqrt_bf16_e64 v5.l, null v_sqrt_bf16_e64 v5.l, -1 // GFX1250: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - -v_sqrt_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - -v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sqrt_bf16 v5.h, v128.h // GFX1250: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00] @@ -4132,15 +4105,6 @@ v_rsq_bf16_e64 v5.l, null v_rsq_bf16_e64 v5.l, -1 // GFX1250: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - -v_rsq_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - -v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rsq_bf16 v5.h, v128.h // GFX1250: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] @@ -4180,15 +4144,6 @@ v_log_bf16_e64 v5.l, null v_log_bf16_e64 v5.l, -1 // GFX1250: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -v_log_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - -v_log_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - -v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_log_bf16 v5.h, v128.h // GFX1250: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] @@ -4228,15 +4183,6 @@ v_exp_bf16_e64 v5.l, null v_exp_bf16_e64 v5.l, -1 // GFX1250: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -v_exp_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - -v_exp_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - -v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_exp_bf16 v5.h, v128.h // GFX1250: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] @@ -4276,15 +4222,6 @@ v_sin_bf16_e64 v5.l, null v_sin_bf16_e64 v5.l, -1 // GFX1250: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -v_sin_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] - -v_sin_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] - -v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sin_bf16 v5.h, v128.h // GFX1250: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] @@ -4324,15 +4261,6 @@ v_cos_bf16_e64 v5.l, null v_cos_bf16_e64 v5.l, -1 // GFX1250: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -v_cos_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] - -v_cos_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] - -v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cos_bf16_e64 v5.h, v128.h // GFX1250: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index b21fca654590a..21077fe4f9f05 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -158,18 +158,6 @@ v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -258,18 +246,6 @@ v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -314,18 +290,6 @@ v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -370,18 +334,6 @@ v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -426,18 +378,6 @@ v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -482,18 +422,6 @@ v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -538,18 +466,6 @@ v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -594,18 +510,6 @@ v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index d1638565a386a..646acf5219d7e 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -162,18 +162,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_tanh_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -266,18 +254,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rcp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -326,18 +302,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -386,18 +350,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -446,18 +398,6 @@ v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -506,18 +446,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -566,18 +494,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -626,18 +542,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index 78afa10b984cb..1907a939b488b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -38,18 +38,6 @@ v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -58,114 +46,30 @@ v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index 6ec4d5f48f8b1..35a51dbe9f922 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -42,18 +42,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_tanh_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -66,18 +54,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rcp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -86,18 +62,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -106,18 +70,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -126,18 +78,6 @@ v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -146,18 +86,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -166,18 +94,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -186,18 +102,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index 67747a65ee52a..0b393973b7875 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -4123,18 +4123,10 @@ # GFX1250-REAL16: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_tanh_f16_e64 v5, v128 ; encoding: [0x05,0x00,0x9f,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00] @@ -4159,10 +4151,6 @@ # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00] @@ -4223,18 +4211,10 @@ 0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00 # GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00] -0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00] @@ -4259,10 +4239,6 @@ # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00] @@ -4287,18 +4263,10 @@ # GFX1250-REAL16: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xf9,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00] @@ -4323,10 +4291,6 @@ # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00] @@ -4351,18 +4315,10 @@ # GFX1250-REAL16: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfa,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] @@ -4387,10 +4343,6 @@ # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] @@ -4415,18 +4367,10 @@ # GFX1250-REAL16: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfb,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_log_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] @@ -4451,10 +4395,6 @@ # GFX1250-REAL16: v_log_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_log_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] @@ -4479,18 +4419,10 @@ # GFX1250-REAL16: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_log_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfc,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_exp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] @@ -4515,10 +4447,6 @@ # GFX1250-REAL16: v_exp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_exp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] @@ -4543,18 +4471,10 @@ # GFX1250-REAL16: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_exp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfd,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_sin_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] @@ -4579,10 +4499,6 @@ # GFX1250-REAL16: v_sin_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_sin_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] @@ -4607,18 +4523,10 @@ # GFX1250-REAL16: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_sin_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfe,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_cos_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] @@ -4643,10 +4551,6 @@ # GFX1250-REAL16: v_cos_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_cos_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index 7c29f8ab01a1b..8b26d2a8696e2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -104,18 +104,6 @@ # GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -197,18 +185,6 @@ 0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff # GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -257,18 +233,6 @@ # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -317,18 +281,6 @@ # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -377,18 +329,6 @@ # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -437,18 +377,6 @@ # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -497,18 +425,6 @@ # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -557,18 +473,6 @@ # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index d26bc46a1f272..15f76c54a1c65 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -34,22 +34,10 @@ # GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] @@ -57,142 +45,58 @@ 0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll index c12d8135e5eba..082b876b542e5 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll @@ -234,16 +234,17 @@ define void @extrastride(ptr nocapture %main, i32 %main_stride, ptr nocapture %r ; X32-NEXT: .p2align 4 ; X32-NEXT: .LBB2_2: # %for.body ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl (%ebx,%esi), %ebp -; X32-NEXT: addl (%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: movl %ebp, (%edx) -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %ebx, %ebp +; X32-NEXT: movl (%ebx,%esi), %ebx +; X32-NEXT: addl (%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: movl %ebx, (%edx) +; X32-NEXT: leal (%ebp,%esi), %ebx ; X32-NEXT: addl %edi, %ebx ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: decl %eax diff --git a/llvm/test/tools/llc/save-stats.ll b/llvm/test/tools/llc/save-stats.ll index 4950625c809cc..a5769f86648dc 100644 --- a/llvm/test/tools/llc/save-stats.ll +++ b/llvm/test/tools/llc/save-stats.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts +; RUN: rm -rf %t.dir && mkdir -p %t.dir && cd %t.dir ; RUN: llc --save-stats=obj -o %t.s %s && cat %t.stats | FileCheck %s ; RUN: llc --save-stats=cwd -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s ; RUN: llc --save-stats -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index 0180ba0a6c163..783267aa05b59 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -76,8 +76,10 @@ public: }; class BogusTargetInstrInfo : public TargetInstrInfo { + BogusRegisterInfo RegInfo; + public: - BogusTargetInstrInfo() : TargetInstrInfo() {} + BogusTargetInstrInfo() : TargetInstrInfo(RegInfo) {} }; class BogusSubtarget : public TargetSubtargetInfo { diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index ee3cd8c20f8e7..32994c12aa98b 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -1103,7 +1103,8 @@ void InstrInfoEmitter::run(raw_ostream &OS) { Twine ClassName = TargetName + "GenInstrInfo"; OS << "struct " << ClassName << " : public TargetInstrInfo {\n" << " explicit " << ClassName - << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode = ~0u, " + << "(const TargetSubtargetInfo &STI, const TargetRegisterInfo &TRI, " + "unsigned CFSetupOpcode = ~0u, " "unsigned CFDestroyOpcode = ~0u, " "unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u);\n" << " ~" << ClassName << "() override = default;\n" @@ -1157,9 +1158,11 @@ void InstrInfoEmitter::run(raw_ostream &OS) { << TargetName << "InstrComplexDeprecationInfos[];\n"; Twine ClassName = TargetName + "GenInstrInfo"; OS << ClassName << "::" << ClassName - << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode, unsigned " + << "(const TargetSubtargetInfo &STI, const TargetRegisterInfo &TRI, " + "unsigned CFSetupOpcode, unsigned " "CFDestroyOpcode, unsigned CatchRetOpcode, unsigned ReturnOpcode)\n" - << " : TargetInstrInfo(CFSetupOpcode, CFDestroyOpcode, CatchRetOpcode, " + << " : TargetInstrInfo(TRI, CFSetupOpcode, CFDestroyOpcode, " + "CatchRetOpcode, " "ReturnOpcode"; if (NumClassesByHwMode != 0) OS << ", " << TargetName diff --git a/revert_patches.txt b/revert_patches.txt index b88b846f64b68..0e7a66132e1a5 100644 --- a/revert_patches.txt +++ b/revert_patches.txt @@ -5,6 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485) breaks build of ROCmValidationSuite [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662) --- -breaks fortran declare-target-link1 -[OMPIRBuilder] Fix addrspace of internal critical section lock (#166459 +build issues +[OpenMP][Clang] Add codegen support for dyn_groupprivate clause (#152830) --- diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 4d279bffd3723..deb56dc0957e9 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1231,6 +1231,7 @@ cc_library( ":format", ":frontend", ":lex", + ":options", ":rewrite", ":support", ":tooling_core", @@ -1501,12 +1502,25 @@ cc_library( gentbl_cc_library( name = "driver_options_inc_gen", - tbl_outs = {"include/clang/Driver/Options.inc": ["-gen-opt-parser-defs"]}, + tbl_outs = {"include/clang/Options/Options.inc": ["-gen-opt-parser-defs"]}, tblgen = "//llvm:llvm-tblgen", - td_file = "include/clang/Driver/Options.td", + td_file = "include/clang/Options/Options.td", deps = ["//llvm:OptParserTdFiles"], ) +cc_library( + name = "options", + srcs = glob(["lib/Options/*.cpp"]), + hdrs = glob(["include/clang/Options/*.h"]), + includes = ["include"], + deps = [ + ":basic", + ":driver_options_inc_gen", + ":static_analyzer_checkers_gen", + "//llvm:Option", + ], +) + cc_library( name = "driver", srcs = glob( @@ -1544,6 +1558,7 @@ cc_library( ":config", ":driver_options_inc_gen", ":lex", + ":options", ":parse", ":static_analyzer_checkers_gen", "//llvm:BinaryFormat", @@ -1700,6 +1715,7 @@ cc_library( ":driver_options_inc_gen", ":edit", ":lex", + ":options", ":parse", ":sema", ":serialization", @@ -1769,6 +1785,7 @@ cc_library( ":frontend", ":frontend_tool", ":lex", + ":options", ":parse", ":sema", ":serialization", @@ -2001,6 +2018,7 @@ cc_library( ":extract_api", ":frontend", ":frontend_rewrite", + ":options", ":static_analyzer_frontend", "//llvm:Option", "//llvm:Support", @@ -2176,6 +2194,7 @@ cc_library( ":frontend_rewrite", ":frontend_tool", ":lex", + ":options", ":parse", ":sema", ":serialization", @@ -2258,6 +2277,7 @@ cc_binary( ":driver", ":frontend", ":frontend_rewrite", + ":options", ":serialization", ":static_analyzer_frontend", ":tooling",