From f63d33da0a517a9a7096e0e67defb50c5995dd41 Mon Sep 17 00:00:00 2001 From: Naveen Seth Hanig Date: Mon, 10 Nov 2025 21:24:39 +0100 Subject: [PATCH 01/30] Reland "[clang] Refactor option-related code from clangDriver into new clangOptions library" (#167374) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This relands #167348. The original PR was reverted due to a reported build failure, which was later diagnosed as a local issue in the developer’s checkout or build state. See discussion here: https://github.com/llvm/llvm-project/pull/163659#discussion_r2511546964 No additional changes have been made in this reland. --- clang-tools-extra/clangd/CMakeLists.txt | 1 + clang-tools-extra/clangd/CompileCommands.cpp | 27 +- clang-tools-extra/modularize/CMakeLists.txt | 1 + .../modularize/CoverageChecker.cpp | 6 +- clang-tools-extra/modularize/Modularize.cpp | 4 +- .../modularize/ModularizeUtilities.cpp | 6 +- clang-tools-extra/pp-trace/CMakeLists.txt | 1 + clang-tools-extra/pp-trace/PPTrace.cpp | 2 +- clang/docs/CMakeLists.txt | 2 +- clang/docs/InternalsManual.rst | 8 +- clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/CMakeLists.txt | 2 +- clang/include/clang/Driver/Driver.h | 2 +- clang/include/clang/Frontend/Utils.h | 2 +- .../clang/{Driver => Options}/CMakeLists.txt | 0 .../{Driver => Options}/ClangOptionDocs.td | 0 .../clang/{Driver => Options}/OptionUtils.h | 6 +- .../clang/{Driver => Options}/Options.h | 20 +- .../clang/{Driver => Options}/Options.td | 0 clang/include/module.modulemap | 1 + clang/lib/CMakeLists.txt | 1 + clang/lib/Driver/CMakeLists.txt | 3 +- clang/lib/Driver/Compilation.cpp | 2 +- clang/lib/Driver/Driver.cpp | 2 +- clang/lib/Driver/SanitizerArgs.cpp | 2 +- clang/lib/Driver/ToolChain.cpp | 6 +- clang/lib/Driver/ToolChains/AIX.cpp | 6 +- clang/lib/Driver/ToolChains/AMDGPU.cpp | 21 +- clang/lib/Driver/ToolChains/AMDGPU.h | 2 +- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 2 +- clang/lib/Driver/ToolChains/AVR.cpp | 2 +- clang/lib/Driver/ToolChains/Arch/AArch64.cpp | 4 +- clang/lib/Driver/ToolChains/Arch/ARM.cpp | 4 +- clang/lib/Driver/ToolChains/Arch/CSKY.cpp | 6 +- .../lib/Driver/ToolChains/Arch/LoongArch.cpp | 5 +- clang/lib/Driver/ToolChains/Arch/M68k.cpp | 16 +- clang/lib/Driver/ToolChains/Arch/Mips.cpp | 5 +- clang/lib/Driver/ToolChains/Arch/PPC.cpp | 2 +- clang/lib/Driver/ToolChains/Arch/RISCV.cpp | 2 +- clang/lib/Driver/ToolChains/Arch/Sparc.cpp | 4 +- clang/lib/Driver/ToolChains/Arch/SystemZ.cpp | 10 +- clang/lib/Driver/ToolChains/Arch/VE.cpp | 2 +- clang/lib/Driver/ToolChains/Arch/X86.cpp | 18 +- clang/lib/Driver/ToolChains/BareMetal.cpp | 4 +- clang/lib/Driver/ToolChains/CSKYToolChain.cpp | 2 +- clang/lib/Driver/ToolChains/Clang.cpp | 13 +- clang/lib/Driver/ToolChains/CommonArgs.cpp | 24 +- clang/lib/Driver/ToolChains/CrossWindows.cpp | 2 +- clang/lib/Driver/ToolChains/Cuda.cpp | 8 +- clang/lib/Driver/ToolChains/Cygwin.cpp | 4 +- clang/lib/Driver/ToolChains/Darwin.cpp | 6 +- clang/lib/Driver/ToolChains/DragonFly.cpp | 4 +- clang/lib/Driver/ToolChains/Flang.cpp | 4 +- clang/lib/Driver/ToolChains/FreeBSD.cpp | 4 +- clang/lib/Driver/ToolChains/Fuchsia.cpp | 4 +- clang/lib/Driver/ToolChains/Gnu.cpp | 10 +- clang/lib/Driver/ToolChains/HIPAMD.cpp | 2 +- clang/lib/Driver/ToolChains/HIPSPV.cpp | 4 +- clang/lib/Driver/ToolChains/HIPUtility.cpp | 2 +- clang/lib/Driver/ToolChains/Hexagon.cpp | 2 +- clang/lib/Driver/ToolChains/Hurd.cpp | 4 +- clang/lib/Driver/ToolChains/Linux.cpp | 4 +- clang/lib/Driver/ToolChains/MSP430.cpp | 2 +- clang/lib/Driver/ToolChains/MSVC.cpp | 2 +- clang/lib/Driver/ToolChains/Managarm.cpp | 4 +- clang/lib/Driver/ToolChains/MinGW.cpp | 2 +- clang/lib/Driver/ToolChains/MipsLinux.cpp | 4 +- clang/lib/Driver/ToolChains/NetBSD.cpp | 4 +- clang/lib/Driver/ToolChains/OHOS.cpp | 4 +- clang/lib/Driver/ToolChains/OpenBSD.cpp | 4 +- clang/lib/Driver/ToolChains/PPCFreeBSD.cpp | 4 +- clang/lib/Driver/ToolChains/PPCLinux.cpp | 4 +- clang/lib/Driver/ToolChains/PS4CPU.cpp | 2 +- clang/lib/Driver/ToolChains/SPIRV.cpp | 2 +- clang/lib/Driver/ToolChains/SYCL.cpp | 2 +- clang/lib/Driver/ToolChains/Solaris.cpp | 4 +- clang/lib/Driver/ToolChains/UEFI.cpp | 2 +- clang/lib/Driver/ToolChains/VEToolchain.cpp | 6 +- clang/lib/Driver/ToolChains/WebAssembly.cpp | 6 +- clang/lib/Driver/ToolChains/XCore.cpp | 6 +- clang/lib/Driver/ToolChains/ZOS.cpp | 2 +- clang/lib/Driver/XRayArgs.cpp | 2 +- clang/lib/Frontend/CMakeLists.txt | 1 + clang/lib/Frontend/CompilerInvocation.cpp | 54 +- .../CreateInvocationFromCommandLine.cpp | 6 +- clang/lib/FrontendTool/CMakeLists.txt | 1 + .../ExecuteCompilerInvocation.cpp | 6 +- clang/lib/Interpreter/Interpreter.cpp | 4 +- clang/lib/Interpreter/InterpreterUtils.h | 2 +- clang/lib/Options/CMakeLists.txt | 18 + .../lib/{Driver => Options}/DriverOptions.cpp | 19 +- clang/lib/{Driver => Options}/OptionUtils.cpp | 2 +- clang/lib/Tooling/CMakeLists.txt | 1 + .../InterpolatingCompilationDatabase.cpp | 12 +- clang/lib/Tooling/Tooling.cpp | 12 +- clang/tools/clang-check/CMakeLists.txt | 1 + clang/tools/clang-check/ClangCheck.cpp | 4 +- clang/tools/clang-installapi/CMakeLists.txt | 1 + .../clang-installapi/ClangInstallAPI.cpp | 4 +- clang/tools/clang-installapi/Options.cpp | 55 +- clang/tools/driver/CMakeLists.txt | 1 + clang/tools/driver/cc1_main.cpp | 2 +- clang/tools/driver/cc1as_main.cpp | 8 +- clang/tools/driver/driver.cpp | 2 +- clang/unittests/Driver/DXCModeTest.cpp | 4 +- clang/www/OpenProjects.html | 2 +- flang/docs/CMakeLists.txt | 2 +- flang/docs/FlangDriver.md | 8 +- flang/lib/Frontend/CMakeLists.txt | 1 + flang/lib/Frontend/CompilerInvocation.cpp | 547 ++++++++---------- flang/lib/FrontendTool/CMakeLists.txt | 1 + .../ExecuteCompilerInvocation.cpp | 6 +- flang/tools/flang-driver/CMakeLists.txt | 1 + flang/tools/flang-driver/driver.cpp | 4 +- .../Platform/MacOSX/PlatformDarwin.cpp | 4 +- llvm/include/llvm/MC/MCAsmInfo.h | 2 +- llvm/include/llvm/Option/Arg.h | 2 +- .../llvm-project-overlay/clang/BUILD.bazel | 4 +- 118 files changed, 590 insertions(+), 610 deletions(-) rename clang/include/clang/{Driver => Options}/CMakeLists.txt (100%) rename clang/include/clang/{Driver => Options}/ClangOptionDocs.td (100%) rename clang/include/clang/{Driver => Options}/OptionUtils.h (94%) rename clang/include/clang/{Driver => Options}/Options.h (83%) rename clang/include/clang/{Driver => Options}/Options.td (100%) create mode 100644 clang/lib/Options/CMakeLists.txt rename clang/lib/{Driver => Options}/DriverOptions.cpp (76%) rename clang/lib/{Driver => Options}/OptionUtils.cpp (97%) diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index fb3f05329be21..d7ec853af862f 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -165,6 +165,7 @@ clang_target_link_libraries(clangDaemon clangBasic clangDependencyScanning clangDriver + clangOptions clangFormat clangFrontend clangIndex diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp index c1be93730129a..7990f2719e9a0 100644 --- a/clang-tools-extra/clangd/CompileCommands.cpp +++ b/clang-tools-extra/clangd/CompileCommands.cpp @@ -11,8 +11,8 @@ #include "support/Logger.h" #include "support/Trace.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInvocation.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/ADT/ArrayRef.h" @@ -206,7 +206,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, if (Cmd.empty()) return; - auto &OptTable = clang::driver::getDriverOptTable(); + auto &OptTable = getDriverOptTable(); // OriginalArgs needs to outlive ArgList. llvm::SmallVector OriginalArgs; OriginalArgs.reserve(Cmd.size()); @@ -222,8 +222,8 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, llvm::opt::InputArgList ArgList; ArgList = OptTable.ParseArgs( llvm::ArrayRef(OriginalArgs).drop_front(), IgnoredCount, IgnoredCount, - llvm::opt::Visibility(IsCLMode ? driver::options::CLOption - : driver::options::ClangOption)); + llvm::opt::Visibility(IsCLMode ? options::CLOption + : options::ClangOption)); llvm::SmallVector IndicesToDrop; // Having multiple architecture options (e.g. when building fat binaries) @@ -232,7 +232,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, // As there are no signals to figure out which one user actually wants. They // can explicitly specify one through `CompileFlags.Add` if need be. unsigned ArchOptCount = 0; - for (auto *Input : ArgList.filtered(driver::options::OPT_arch)) { + for (auto *Input : ArgList.filtered(options::OPT_arch)) { ++ArchOptCount; for (auto I = 0U; I <= Input->getNumValues(); ++I) IndicesToDrop.push_back(Input->getIndex() + I); @@ -262,13 +262,12 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, // explicitly at the end of the flags. This ensures modifications done in the // following steps apply in more cases (like setting -x, which only affects // inputs that come after it). - for (auto *Input : ArgList.filtered(driver::options::OPT_INPUT)) { + for (auto *Input : ArgList.filtered(options::OPT_INPUT)) { SawInput(Input->getValue(0)); IndicesToDrop.push_back(Input->getIndex()); } // Anything after `--` is also treated as input, drop them as well. - if (auto *DashDash = - ArgList.getLastArgNoClaim(driver::options::OPT__DASH_DASH)) { + if (auto *DashDash = ArgList.getLastArgNoClaim(options::OPT__DASH_DASH)) { auto DashDashIndex = DashDash->getIndex() + 1; // +1 accounts for Cmd[0] // Another +1 so we don't treat the `--` itself as an input. for (unsigned I = DashDashIndex + 1; I < Cmd.size(); ++I) @@ -424,11 +423,11 @@ DriverMode getDriverMode(const std::vector &Args) { // Returns the set of DriverModes where an option may be used. unsigned char getModes(const llvm::opt::Option &Opt) { unsigned char Result = DM_None; - if (Opt.hasVisibilityFlag(driver::options::ClangOption)) + if (Opt.hasVisibilityFlag(options::ClangOption)) Result |= DM_GCC; - if (Opt.hasVisibilityFlag(driver::options::CC1Option)) + if (Opt.hasVisibilityFlag(options::CC1Option)) Result |= DM_CC1; - if (Opt.hasVisibilityFlag(driver::options::CLOption)) + if (Opt.hasVisibilityFlag(options::CLOption)) Result |= DM_CL; return Result; } @@ -442,8 +441,8 @@ llvm::ArrayRef ArgStripper::rulesFor(llvm::StringRef Arg) { using TableTy = llvm::StringMap, llvm::BumpPtrAllocator>; static TableTy *Table = [] { - auto &DriverTable = driver::getDriverOptTable(); - using DriverID = clang::driver::options::ID; + auto &DriverTable = getDriverOptTable(); + using DriverID = clang::options::ID; // Collect sets of aliases, so we can treat -foo and -foo= as synonyms. // Conceptually a double-linked list: PrevAlias[I] -> I -> NextAlias[I]. @@ -468,7 +467,7 @@ llvm::ArrayRef ArgStripper::rulesFor(llvm::StringRef Arg) { FLAGS, VISIBILITY, PARAM, HELPTEXT, HELPTEXTSFORVARIANTS, \ METAVAR, VALUES, SUBCOMMANDIDS_OFFSET) \ {DriverID::OPT_##ID, DriverID::OPT_##ALIAS, ALIASARGS}, -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTION }; for (auto &E : AliasTable) diff --git a/clang-tools-extra/modularize/CMakeLists.txt b/clang-tools-extra/modularize/CMakeLists.txt index eb5383c3ad44e..a775b790a3147 100644 --- a/clang-tools-extra/modularize/CMakeLists.txt +++ b/clang-tools-extra/modularize/CMakeLists.txt @@ -20,6 +20,7 @@ clang_target_link_libraries(modularize clangAST clangBasic clangDriver + clangOptions clangFrontend clangLex clangSerialization diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp index 1345a6ef8f489..d80d78c64c6e2 100644 --- a/clang-tools-extra/modularize/CoverageChecker.cpp +++ b/clang-tools-extra/modularize/CoverageChecker.cpp @@ -50,18 +50,18 @@ // //===----------------------------------------------------------------------===// +#include "CoverageChecker.h" #include "ModularizeUtilities.h" #include "clang/AST/ASTConsumer.h" -#include "CoverageChecker.h" #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Option.h" @@ -73,7 +73,7 @@ using namespace Modularize; using namespace clang; using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace clang::tooling; namespace cl = llvm::cl; namespace sys = llvm::sys; diff --git a/clang-tools-extra/modularize/Modularize.cpp b/clang-tools-extra/modularize/Modularize.cpp index 376ad0c7875bf..33966b44f719a 100644 --- a/clang-tools-extra/modularize/Modularize.cpp +++ b/clang-tools-extra/modularize/Modularize.cpp @@ -231,11 +231,11 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Arg.h" @@ -254,7 +254,7 @@ using namespace clang; using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace clang::tooling; using namespace llvm; using namespace llvm::opt; diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp index 4dd84feac5df4..6978a6b2fe1b7 100644 --- a/clang-tools-extra/modularize/ModularizeUtilities.cpp +++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp @@ -12,17 +12,17 @@ // //===----------------------------------------------------------------------===// +#include "ModularizeUtilities.h" +#include "CoverageChecker.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendActions.h" -#include "CoverageChecker.h" +#include "clang/Options/Options.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" -#include "ModularizeUtilities.h" using namespace clang; using namespace llvm; diff --git a/clang-tools-extra/pp-trace/CMakeLists.txt b/clang-tools-extra/pp-trace/CMakeLists.txt index 1323adbc35269..da36582ee0234 100644 --- a/clang-tools-extra/pp-trace/CMakeLists.txt +++ b/clang-tools-extra/pp-trace/CMakeLists.txt @@ -14,6 +14,7 @@ clang_target_link_libraries(pp-trace PRIVATE clangAST clangBasic + clangOptions clangFrontend clangLex clangSerialization diff --git a/clang-tools-extra/pp-trace/PPTrace.cpp b/clang-tools-extra/pp-trace/PPTrace.cpp index 0b078c49a55b7..ba5a06a26830d 100644 --- a/clang-tools-extra/pp-trace/PPTrace.cpp +++ b/clang-tools-extra/pp-trace/PPTrace.cpp @@ -28,11 +28,11 @@ #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/Basic/SourceManager.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Options/Options.h" #include "clang/Tooling/Execution.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Arg.h" diff --git a/clang/docs/CMakeLists.txt b/clang/docs/CMakeLists.txt index 1f06c040c96cb..9469a832adb62 100644 --- a/clang/docs/CMakeLists.txt +++ b/clang/docs/CMakeLists.txt @@ -132,7 +132,7 @@ if (LLVM_ENABLE_SPHINX) # Generated files gen_rst_file_from_td(AttributeReference.rst -gen-attr-docs ../include/clang/Basic/Attr.td "${docs_targets}") gen_rst_file_from_td(DiagnosticsReference.rst -gen-diag-docs ../include/clang/Basic/Diagnostic.td "${docs_targets}") - gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Driver/ClangOptionDocs.td "${docs_targets}") + gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Options/ClangOptionDocs.td "${docs_targets}") # Another generated file from a different source set(docs_tools_dir ${CMAKE_CURRENT_SOURCE_DIR}/tools) diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index eff46ab46e1ca..a849d05eb7ae9 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -667,7 +667,7 @@ Command Line Interface ---------------------- The command line interface of the Clang ``-cc1`` frontend is defined alongside -the driver options in ``clang/Driver/Options.td``. The information making up an +the driver options in ``clang/Options/Options.td``. The information making up an option definition includes its prefix and name (for example ``-std=``), form and position of the option value, help text, aliases and more. Each option may belong to a certain group and can be marked with zero or more flags. Options @@ -712,7 +712,7 @@ variable for the option value: } Next, declare the command line interface of the option in the tablegen file -``clang/include/clang/Driver/Options.td``. This is done by instantiating the +``clang/include/clang/Options/Options.td``. This is done by instantiating the ``Option`` class (defined in ``llvm/include/llvm/Option/OptParser.td``). The instance is typically created through one of the helper classes that encode the acceptable ways to specify the option value on the command line: @@ -906,7 +906,7 @@ command line: SHOULD_PARSE, KEYPATH, DEFAULT_VALUE, \ IMPLIED_CHECK, IMPLIED_VALUE, NORMALIZER, \ MERGER, TABLE_INDEX) - #include "clang/Driver/Options.inc" + #include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // ... @@ -925,7 +925,7 @@ command line: GENERATE_OPTION_WITH_MARSHALLING( \ Args, SA, KIND, FLAGS, SPELLING, ALWAYS_EMIT, KEYPATH, DEFAULT_VALUE, \ IMPLIED_CHECK, IMPLIED_VALUE, DENORMALIZER, EXTRACTOR, TABLE_INDEX) - #include "clang/Driver/Options.inc" + #include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // ... diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e9c0723f6787f..c43f236aab319 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -79,6 +79,9 @@ Potentially Breaking Changes void foo(void) { return ({ 1;; }); } +- Downstream projects that previously linked only against ``clangDriver`` may + now (also) need to link against the new ``clangOptions`` library, since + options-related code has been moved out of the Driver into a separate library. C/C++ Language Potentially Breaking Changes ------------------------------------------- diff --git a/clang/include/clang/CMakeLists.txt b/clang/include/clang/CMakeLists.txt index 47ac70cd21690..77a44e4c48de5 100644 --- a/clang/include/clang/CMakeLists.txt +++ b/clang/include/clang/CMakeLists.txt @@ -3,7 +3,7 @@ add_subdirectory(Basic) if(CLANG_ENABLE_CIR) add_subdirectory(CIR) endif() -add_subdirectory(Driver) +add_subdirectory(Options) add_subdirectory(Parse) add_subdirectory(Sema) add_subdirectory(Serialization) diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index b9b187ada8add..aa86bffb802a4 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -15,11 +15,11 @@ #include "clang/Driver/Action.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Phases.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Types.h" #include "clang/Driver/Util.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/StringMap.h" diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h index 49fd920d1ec43..ed2703c76f18d 100644 --- a/clang/include/clang/Frontend/Utils.h +++ b/clang/include/clang/Frontend/Utils.h @@ -15,8 +15,8 @@ #include "clang/Basic/Diagnostic.h" #include "clang/Basic/LLVM.h" -#include "clang/Driver/OptionUtils.h" #include "clang/Frontend/DependencyOutputOptions.h" +#include "clang/Options/OptionUtils.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/StringMap.h" diff --git a/clang/include/clang/Driver/CMakeLists.txt b/clang/include/clang/Options/CMakeLists.txt similarity index 100% rename from clang/include/clang/Driver/CMakeLists.txt rename to clang/include/clang/Options/CMakeLists.txt diff --git a/clang/include/clang/Driver/ClangOptionDocs.td b/clang/include/clang/Options/ClangOptionDocs.td similarity index 100% rename from clang/include/clang/Driver/ClangOptionDocs.td rename to clang/include/clang/Options/ClangOptionDocs.td diff --git a/clang/include/clang/Driver/OptionUtils.h b/clang/include/clang/Options/OptionUtils.h similarity index 94% rename from clang/include/clang/Driver/OptionUtils.h rename to clang/include/clang/Options/OptionUtils.h index 922f536bf33ea..83c48bd7d6843 100644 --- a/clang/include/clang/Driver/OptionUtils.h +++ b/clang/include/clang/Options/OptionUtils.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_DRIVER_OPTIONUTILS_H -#define LLVM_CLANG_DRIVER_OPTIONUTILS_H +#ifndef LLVM_CLANG_OPTIONS_OPTIONUTILS_H +#define LLVM_CLANG_OPTIONS_OPTIONUTILS_H #include "clang/Basic/Diagnostic.h" #include "clang/Basic/LLVM.h" @@ -55,4 +55,4 @@ inline uint64_t getLastArgUInt64Value(const llvm::opt::ArgList &Args, } // namespace clang -#endif // LLVM_CLANG_DRIVER_OPTIONUTILS_H +#endif // LLVM_CLANG_OPTIONS_OPTIONUTILS_H diff --git a/clang/include/clang/Driver/Options.h b/clang/include/clang/Options/Options.h similarity index 83% rename from clang/include/clang/Driver/Options.h rename to clang/include/clang/Options/Options.h index 0797410e9940e..ac98699001965 100644 --- a/clang/include/clang/Driver/Options.h +++ b/clang/include/clang/Options/Options.h @@ -6,14 +6,13 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_DRIVER_OPTIONS_H -#define LLVM_CLANG_DRIVER_OPTIONS_H +#ifndef LLVM_CLANG_OPTIONS_OPTIONS_H +#define LLVM_CLANG_OPTIONS_OPTIONS_H #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" namespace clang { -namespace driver { namespace options { /// Flags specifically for clang options. Must not overlap with @@ -42,16 +41,15 @@ enum ClangVisibility { }; enum ID { - OPT_INVALID = 0, // This is not an option ID. + OPT_INVALID = 0, // This is not an option ID. #define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__), -#include "clang/Driver/Options.inc" - LastOption +#include "clang/Options/Options.inc" + LastOption #undef OPTION - }; -} +}; +} // namespace options const llvm::opt::OptTable &getDriverOptTable(); -} -} +} // namespace clang -#endif +#endif // LLVM_CLANG_OPTIONS_OPTIONS_H diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Options/Options.td similarity index 100% rename from clang/include/clang/Driver/Options.td rename to clang/include/clang/Options/Options.td diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap index c5535262ae38c..a11c8683c601e 100644 --- a/clang/include/module.modulemap +++ b/clang/include/module.modulemap @@ -146,6 +146,7 @@ module Clang_Lex { module * { export * } } +module Clang_Options { requires cplusplus umbrella "clang/Options" module * { export * } } module Clang_Parse { requires cplusplus umbrella "clang/Parse" module * { export * } } module Clang_Rewrite { requires cplusplus umbrella "clang/Rewrite/Core" module * { export * } } module Clang_RewriteFrontend { requires cplusplus umbrella "clang/Rewrite/Frontend" module * { export * } } diff --git a/clang/lib/CMakeLists.txt b/clang/lib/CMakeLists.txt index 4f2218b583e41..e90b009da606a 100644 --- a/clang/lib/CMakeLists.txt +++ b/clang/lib/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(Edit) add_subdirectory(ExtractAPI) add_subdirectory(Rewrite) add_subdirectory(Driver) +add_subdirectory(Options) add_subdirectory(Serialization) add_subdirectory(Frontend) add_subdirectory(FrontendTool) diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt index 7c4f70b966c48..8052659e9836b 100644 --- a/clang/lib/Driver/CMakeLists.txt +++ b/clang/lib/Driver/CMakeLists.txt @@ -19,12 +19,10 @@ add_clang_library(clangDriver Compilation.cpp Distro.cpp Driver.cpp - DriverOptions.cpp Job.cpp Multilib.cpp MultilibBuilder.cpp OffloadBundler.cpp - OptionUtils.cpp Phases.cpp SanitizerArgs.cpp Tool.cpp @@ -99,5 +97,6 @@ add_clang_library(clangDriver LINK_LIBS clangBasic clangLex + clangOptions ${system_libs} ) diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp index 4e300316ae9ba..f8ca2a3d09407 100644 --- a/clang/lib/Driver/Compilation.cpp +++ b/clang/lib/Driver/Compilation.cpp @@ -11,9 +11,9 @@ #include "clang/Driver/Action.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Util.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/OptSpecifier.h" #include "llvm/Option/Option.h" diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index a0b82cec9a372..9fd64d4aac514 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -60,13 +60,13 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Phases.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Types.h" #include "clang/Lex/DependencyDirectivesScanner.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 5dd48f53b9069..420c4cddbc8dd 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -8,8 +8,8 @@ #include "clang/Driver/SanitizerArgs.h" #include "clang/Basic/Sanitizers.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" +#include "clang/Options/Options.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 7f70f1c444eb9..5ff7d83946137 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -21,9 +21,9 @@ #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/XRayArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" @@ -338,7 +338,7 @@ static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple, Multilib::flags_list ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const { - using namespace clang::driver::options; + using namespace clang::options; std::vector Result; const llvm::Triple Triple(ComputeEffectiveClangTriple(Args)); @@ -1805,7 +1805,7 @@ void ToolChain::TranslateXarchArgs( unsigned Index = BaseArgs.MakeIndex(A->getValue(ValuePos)); unsigned Prev = Index; std::unique_ptr XarchArg(Opts.ParseOneArg( - Args, Index, llvm::opt::Visibility(clang::driver::options::ClangOption))); + Args, Index, llvm::opt::Visibility(options::ClangOption))); // If the argument parsing failed or more than one argument was // consumed, the -Xarch_ argument's parameter tried to consume diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index 066b59305fe3f..a8acf9cfc44c9 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -9,8 +9,8 @@ #include "AIX.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" @@ -19,6 +19,7 @@ #include using AIX = clang::driver::toolchains::AIX; +using namespace clang; using namespace clang::driver; using namespace clang::driver::tools; using namespace clang::driver::toolchains; @@ -167,8 +168,7 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, Args.hasArg(options::OPT_coverage)) CmdArgs.push_back("-bdbg:namedsects:ss"); - if (Arg *A = - Args.getLastArg(clang::driver::options::OPT_mxcoff_build_id_EQ)) { + if (Arg *A = Args.getLastArg(options::OPT_mxcoff_build_id_EQ)) { StringRef BuildId = A->getValue(); if (BuildId[0] != '0' || BuildId[1] != 'x' || BuildId.find_if_not(llvm::isHexDigit, 2) != StringRef::npos) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 1a243fef9532d..9dc2c6ce39ae4 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Error.h" @@ -323,27 +323,24 @@ RocmInstallationDetector::RocmInstallationDetector( const llvm::opt::ArgList &Args, bool DetectHIPRuntime, bool DetectDeviceLib) : D(D) { Verbose = Args.hasArg(options::OPT_v); - RocmPathArg = Args.getLastArgValue(clang::driver::options::OPT_rocm_path_EQ); - PrintROCmSearchDirs = - Args.hasArg(clang::driver::options::OPT_print_rocm_search_dirs); + RocmPathArg = Args.getLastArgValue(options::OPT_rocm_path_EQ); + PrintROCmSearchDirs = Args.hasArg(options::OPT_print_rocm_search_dirs); RocmDeviceLibPathArg = - Args.getAllArgValues(clang::driver::options::OPT_rocm_device_lib_path_EQ); - HIPPathArg = Args.getLastArgValue(clang::driver::options::OPT_hip_path_EQ); - HIPStdParPathArg = - Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_path_EQ); + Args.getAllArgValues(options::OPT_rocm_device_lib_path_EQ); + HIPPathArg = Args.getLastArgValue(options::OPT_hip_path_EQ); + HIPStdParPathArg = Args.getLastArgValue(options::OPT_hipstdpar_path_EQ); HasHIPStdParLibrary = !HIPStdParPathArg.empty() && D.getVFS().exists(HIPStdParPathArg + "/hipstdpar_lib.hpp"); HIPRocThrustPathArg = - Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_thrust_path_EQ); + Args.getLastArgValue(options::OPT_hipstdpar_thrust_path_EQ); HasRocThrustLibrary = !HIPRocThrustPathArg.empty() && D.getVFS().exists(HIPRocThrustPathArg + "/thrust"); - HIPRocPrimPathArg = - Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_prim_path_EQ); + HIPRocPrimPathArg = Args.getLastArgValue(options::OPT_hipstdpar_prim_path_EQ); HasRocPrimLibrary = !HIPRocPrimPathArg.empty() && D.getVFS().exists(HIPRocPrimPathArg + "/rocprim"); - if (auto *A = Args.getLastArg(clang::driver::options::OPT_hip_version_EQ)) { + if (auto *A = Args.getLastArg(options::OPT_hip_version_EQ)) { HIPVersionArg = A->getValue(); unsigned Major = ~0U; unsigned Minor = ~0U; diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index e90a5736911e4..7b999c311154f 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -11,9 +11,9 @@ #include "Gnu.h" #include "clang/Basic/TargetID.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" +#include "clang/Options/Options.h" #include "llvm/ADT/SmallString.h" #include "llvm/TargetParser/TargetParser.h" diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index 2b41d54a9eb73..e14bc574d139a 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -10,8 +10,8 @@ #include "AMDGPU.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" +#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" using namespace clang::driver; diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp index 731076d9754a9..588255dc5a0cd 100644 --- a/clang/lib/Driver/ToolChains/AVR.cpp +++ b/clang/lib/Driver/ToolChains/AVR.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp index e8d5e38a9064f..d6fb2a57539ed 100644 --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -9,7 +9,7 @@ #include "AArch64.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/TargetParser/Host.h" @@ -222,7 +222,7 @@ void aarch64::getAArch64TargetFeatures(const Driver &D, // Default to 'A' profile if the architecture is not specified. success = getAArch64ArchFeaturesFromMarch(D, "armv8-a", Args, Extensions); - if (success && (A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ))) + if (success && (A = Args.getLastArg(options::OPT_mtune_EQ))) success = getAArch64MicroArchFeaturesFromMtune(D, A->getValue(), Args, Features); else if (success && (A = Args.getLastArg(options::OPT_mcpu_EQ))) diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp index 61beb0455147d..55eb2dcf7ddf4 100644 --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp @@ -8,7 +8,7 @@ #include "ARM.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/ARMTargetParser.h" @@ -74,7 +74,7 @@ bool arm::isARMEABIBareMetal(const llvm::Triple &Triple) { // Get Arch/CPU from args. void arm::getARMArchCPUFromArgs(const ArgList &Args, llvm::StringRef &Arch, llvm::StringRef &CPU, bool FromAs) { - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) + if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) CPU = A->getValue(); if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) Arch = A->getValue(); diff --git a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp index 2fd2c72147f5b..65f6534e4d038 100644 --- a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp +++ b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp @@ -8,7 +8,7 @@ #include "CSKY.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/CSKYTargetParser.h" @@ -33,7 +33,7 @@ csky::getCSKYArchName(const Driver &D, const ArgList &Args, return std::optional(A->getValue()); } - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { llvm::CSKY::ArchKind ArchKind = llvm::CSKY::parseCPUArch(A->getValue()); if (ArchKind == llvm::CSKY::ArchKind::INVALID) { D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args); @@ -126,7 +126,7 @@ void csky::getCSKYTargetFeatures(const Driver &D, const llvm::Triple &Triple, archName = A->getValue(); } - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { llvm::CSKY::ArchKind Kind = llvm::CSKY::parseCPUArch(A->getValue()); if (Kind == llvm::CSKY::ArchKind::INVALID) { D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args); diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index 156ea03045569..da084bdabaee3 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -11,7 +11,7 @@ #include "clang/Basic/DiagnosticDriver.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/TargetParser/Host.h" #include "llvm/TargetParser/LoongArchTargetParser.h" @@ -130,8 +130,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, const ArgList &Args, std::vector &Features) { // Enable the `lsx` feature on 64-bit LoongArch by default. - if (Triple.isLoongArch64() && - (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ))) + if (Triple.isLoongArch64() && (!Args.hasArgNoClaim(options::OPT_march_EQ))) Features.push_back("+lsx"); // -mrelax is default, unless -mno-relax is specified. diff --git a/clang/lib/Driver/ToolChains/Arch/M68k.cpp b/clang/lib/Driver/ToolChains/Arch/M68k.cpp index 708ec84a37cfb..a620597f10475 100644 --- a/clang/lib/Driver/ToolChains/Arch/M68k.cpp +++ b/clang/lib/Driver/ToolChains/Arch/M68k.cpp @@ -8,7 +8,7 @@ #include "M68k.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Regex.h" @@ -21,7 +21,7 @@ using namespace llvm::opt; /// getM68kTargetCPU - Get the (LLVM) name of the 68000 cpu we are targeting. std::string m68k::getM68kTargetCPU(const ArgList &Args) { - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { + if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { // The canonical CPU name is captalize. However, we allow // starting with lower case or numbers only StringRef CPUName = A->getValue(); @@ -45,17 +45,17 @@ std::string m68k::getM68kTargetCPU(const ArgList &Args) { .Default(CPUName.str()); } // FIXME: Throw error when multiple sub-architecture flag exist - if (Args.hasArg(clang::driver::options::OPT_m68000)) + if (Args.hasArg(options::OPT_m68000)) return "M68000"; - if (Args.hasArg(clang::driver::options::OPT_m68010)) + if (Args.hasArg(options::OPT_m68010)) return "M68010"; - if (Args.hasArg(clang::driver::options::OPT_m68020)) + if (Args.hasArg(options::OPT_m68020)) return "M68020"; - if (Args.hasArg(clang::driver::options::OPT_m68030)) + if (Args.hasArg(options::OPT_m68030)) return "M68030"; - if (Args.hasArg(clang::driver::options::OPT_m68040)) + if (Args.hasArg(options::OPT_m68040)) return "M68040"; - if (Args.hasArg(clang::driver::options::OPT_m68060)) + if (Args.hasArg(options::OPT_m68060)) return "M68060"; return ""; diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp index 8d7b85dbeed99..103aae7018fbf 100644 --- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp @@ -9,7 +9,7 @@ #include "Mips.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" @@ -49,8 +49,7 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple, DefMips64CPU = "mips3"; } - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ, - options::OPT_mcpu_EQ)) + if (Arg *A = Args.getLastArg(options::OPT_march_EQ, options::OPT_mcpu_EQ)) CPUName = A->getValue(); if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) { diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp index 361a68a892a8f..44afdd249fea5 100644 --- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp +++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp @@ -9,7 +9,7 @@ #include "PPC.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/Host.h" diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp index f2e79e71f93d4..1dcce6d053a39 100644 --- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp +++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp @@ -10,7 +10,7 @@ #include "../Clang.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Error.h" #include "llvm/TargetParser/Host.h" diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp index 94a94f1e9c487..49256d80cbdf6 100644 --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp @@ -8,7 +8,7 @@ #include "Sparc.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/Host.h" @@ -122,7 +122,7 @@ sparc::FloatABI sparc::getSparcFloatABI(const Driver &D, std::string sparc::getSparcTargetCPU(const Driver &D, const ArgList &Args, const llvm::Triple &Triple) { - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { StringRef CPUName = A->getValue(); if (CPUName == "native") { std::string CPU = std::string(llvm::sys::getHostCPUName()); diff --git a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp index 75b6afd925245..1ef6a725483e8 100644 --- a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp +++ b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp @@ -8,7 +8,7 @@ #include "SystemZ.h" #include "clang/Config/config.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/Host.h" @@ -25,9 +25,9 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D, D.Diag(diag::err_drv_unsupported_opt) << Args.getLastArg(options::OPT_mfloat_abi_EQ)->getAsString(Args); - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_msoft_float, - options::OPT_mhard_float)) - if (A->getOption().matches(clang::driver::options::OPT_msoft_float)) + if (Arg *A = + Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float)) + if (A->getOption().matches(options::OPT_msoft_float)) ABI = systemz::FloatABI::Soft; return ABI; @@ -35,7 +35,7 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D, std::string systemz::getSystemZTargetCPU(const ArgList &Args, const llvm::Triple &T) { - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { llvm::StringRef CPUName = A->getValue(); if (CPUName == "native") { diff --git a/clang/lib/Driver/ToolChains/Arch/VE.cpp b/clang/lib/Driver/ToolChains/Arch/VE.cpp index adc0873586588..c8353d7dc5f3a 100644 --- a/clang/lib/Driver/ToolChains/Arch/VE.cpp +++ b/clang/lib/Driver/ToolChains/Arch/VE.cpp @@ -8,7 +8,7 @@ #include "VE.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" using namespace clang::driver; diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 1373905a5120e..092069b6ade56 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -8,7 +8,7 @@ #include "X86.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/Option/ArgList.h" @@ -21,7 +21,7 @@ using namespace llvm::opt; std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args, const llvm::Triple &Triple) { - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { StringRef CPU = A->getValue(); if (CPU != "native") return std::string(CPU); @@ -119,7 +119,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, std::vector &Features) { // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or // "ms_abi" as default function attributes. - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) { StringRef DefaultAbi = (Triple.isOSWindows() || Triple.isUEFI()) ? "ms" : "sysv"; if (A->getValue() != DefaultAbi) @@ -128,7 +128,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, } // If -march=native, autodetect the feature list. - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { if (StringRef(A->getValue()) == "native") { for (auto &F : llvm::sys::getHostCPUFeatures()) Features.push_back( @@ -163,7 +163,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, // flags). This is a bit hacky but keeps existing usages working. We should // consider deprecating this and instead warn if the user requests external // retpoline thunks and *doesn't* request some form of retpolines. - auto SpectreOpt = clang::driver::options::ID::OPT_INVALID; + auto SpectreOpt = options::ID::OPT_INVALID; if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline, options::OPT_mspeculative_load_hardening, options::OPT_mno_speculative_load_hardening)) { @@ -189,7 +189,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, SpectreOpt = options::OPT_mretpoline_external_thunk; } - auto LVIOpt = clang::driver::options::ID::OPT_INVALID; + auto LVIOpt = options::ID::OPT_INVALID; if (Args.hasFlag(options::OPT_mlvi_hardening, options::OPT_mno_lvi_hardening, false)) { Features.push_back("+lvi-load-hardening"); @@ -207,7 +207,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, << D.getOpts().getOptionName(options::OPT_mlvi_hardening) << D.getOpts().getOptionName(options::OPT_m_seses); - if (SpectreOpt != clang::driver::options::ID::OPT_INVALID) + if (SpectreOpt != options::ID::OPT_INVALID) D.Diag(diag::err_drv_argument_not_allowed_with) << D.getOpts().getOptionName(SpectreOpt) << D.getOpts().getOptionName(options::OPT_m_seses); @@ -219,8 +219,8 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, } } - if (SpectreOpt != clang::driver::options::ID::OPT_INVALID && - LVIOpt != clang::driver::options::ID::OPT_INVALID) { + if (SpectreOpt != options::ID::OPT_INVALID && + LVIOpt != options::ID::OPT_INVALID) { D.Diag(diag::err_drv_argument_not_allowed_with) << D.getOpts().getOptionName(SpectreOpt) << D.getOpts().getOptionName(LVIOpt); diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index 9b7f58c392885..8d598be9ffb0a 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -18,7 +18,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/MultilibBuilder.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" @@ -135,7 +135,7 @@ static std::string computeClangRuntimesSysRoot(const Driver &D, bool BareMetal::initGCCInstallation(const llvm::Triple &Triple, const llvm::opt::ArgList &Args) { if (Args.getLastArg(options::OPT_gcc_toolchain) || - Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) { + Args.getLastArg(clang::options::OPT_gcc_install_dir_EQ)) { GCCInstallation.init(Triple, Args); return GCCInstallation.isValid(); } diff --git a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp index e4db3307ee3aa..c561d7d38da5b 100644 --- a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp +++ b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index e20963ac288d8..2791b1e57877e 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -29,10 +29,10 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Distro.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/Types.h" #include "clang/Driver/XRayArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" @@ -65,7 +65,7 @@ using namespace clang; using namespace llvm::opt; static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) { - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC, + if (Arg *A = Args.getLastArg(options::OPT_C, options::OPT_CC, options::OPT_fminimize_whitespace, options::OPT_fno_minimize_whitespace, options::OPT_fkeep_system_includes, @@ -1661,7 +1661,7 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args, AddAAPCSVolatileBitfieldArgs(Args, CmdArgs); - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { CmdArgs.push_back("-tune-cpu"); if (strcmp(A->getValue(), "native") == 0) CmdArgs.push_back(Args.MakeArgString(llvm::sys::getHostCPUName())); @@ -2067,7 +2067,7 @@ void Clang::AddSparcTargetArgs(const ArgList &Args, CmdArgs.push_back("hard"); } - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { StringRef Name = A->getValue(); std::string TuneCPU; if (Name == "native") @@ -2173,12 +2173,11 @@ void Clang::AddX86TargetArgs(const ArgList &Args, // Default to "generic" unless -march is present or targetting the PS4/PS5. std::string TuneCPU; - if (!Args.hasArg(clang::driver::options::OPT_march_EQ) && - !getToolChain().getTriple().isPS()) + if (!Args.hasArg(options::OPT_march_EQ) && !getToolChain().getTriple().isPS()) TuneCPU = "generic"; // Override based on -mtune. - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { StringRef Name = A->getValue(); if (Name == "native") { diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index ec8dcdc81db56..9e3ca9f281195 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -31,11 +31,11 @@ #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Util.h" #include "clang/Driver/XRayArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" @@ -69,8 +69,7 @@ using namespace llvm::opt; static bool useFramePointerForTargetByDefault(const llvm::opt::ArgList &Args, const llvm::Triple &Triple) { - if (Args.hasArg(clang::driver::options::OPT_pg) && - !Args.hasArg(clang::driver::options::OPT_mfentry)) + if (Args.hasArg(options::OPT_pg) && !Args.hasArg(options::OPT_mfentry)) return true; if (Triple.isAndroid()) @@ -249,17 +248,16 @@ getFramePointerKind(const llvm::opt::ArgList &Args, // without requiring new frame records to be created. bool DefaultFP = useFramePointerForTargetByDefault(Args, Triple); - bool EnableFP = - mustUseNonLeafFramePointerForTarget(Triple) || - Args.hasFlag(clang::driver::options::OPT_fno_omit_frame_pointer, - clang::driver::options::OPT_fomit_frame_pointer, DefaultFP); + bool EnableFP = mustUseNonLeafFramePointerForTarget(Triple) || + Args.hasFlag(options::OPT_fno_omit_frame_pointer, + options::OPT_fomit_frame_pointer, DefaultFP); bool DefaultLeafFP = useLeafFramePointerForTargetByDefault(Triple) || (EnableFP && framePointerImpliesLeafFramePointer(Args, Triple)); - bool EnableLeafFP = Args.hasFlag( - clang::driver::options::OPT_mno_omit_leaf_frame_pointer, - clang::driver::options::OPT_momit_leaf_frame_pointer, DefaultLeafFP); + bool EnableLeafFP = + Args.hasFlag(options::OPT_mno_omit_leaf_frame_pointer, + options::OPT_momit_leaf_frame_pointer, DefaultLeafFP); bool FPRegReserved = EnableFP || mustMaintainValidFrameChain(Args, Triple); @@ -753,7 +751,7 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args, case llvm::Triple::ppcle: case llvm::Triple::ppc64: case llvm::Triple::ppc64le: - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) + if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) return std::string( llvm::PPC::getNormalizedPPCTargetCPU(T, A->getValue())); return std::string(llvm::PPC::getNormalizedPPCTargetCPU(T)); @@ -1733,7 +1731,7 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, if (SanArgs.needsFuzzerInterceptors()) addSanitizerRuntime(TC, Args, CmdArgs, "fuzzer_interceptors", false, true); - if (!Args.hasArg(clang::driver::options::OPT_nostdlibxx)) { + if (!Args.hasArg(options::OPT_nostdlibxx)) { bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) && !Args.hasArg(options::OPT_static); if (OnlyLibstdcxxStatic) @@ -3385,7 +3383,7 @@ void tools::handleInterchangeLoopsArgs(const ArgList &Args, // Otherwise, return an empty string and issue a diagnosic message if needed. StringRef tools::parseMPreferVectorWidthOption(clang::DiagnosticsEngine &Diags, const llvm::opt::ArgList &Args) { - Arg *A = Args.getLastArg(clang::driver::options::OPT_mprefer_vector_width_EQ); + Arg *A = Args.getLastArg(options::OPT_mprefer_vector_width_EQ); if (!A) return ""; diff --git a/clang/lib/Driver/ToolChains/CrossWindows.cpp b/clang/lib/Driver/ToolChains/CrossWindows.cpp index 51c892fc91718..6df5315e8fff8 100644 --- a/clang/lib/Driver/ToolChains/CrossWindows.cpp +++ b/clang/lib/Driver/ToolChains/CrossWindows.cpp @@ -10,8 +10,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 07201cc4676ac..6cc73ff5fc1f6 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -14,7 +14,7 @@ #include "clang/Driver/Distro.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE #include "llvm/Option/ArgList.h" @@ -153,16 +153,16 @@ CudaInstallationDetector::CudaInstallationDetector( std::initializer_list Versions = {"8.0", "7.5", "7.0"}; auto &FS = D.getVFS(); - if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) { + if (Args.hasArg(options::OPT_cuda_path_EQ)) { Candidates.emplace_back( - Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str()); + Args.getLastArgValue(options::OPT_cuda_path_EQ).str()); } else if (HostTriple.isOSWindows()) { for (const char *Ver : Versions) Candidates.emplace_back( D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" + Ver); } else { - if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) { + if (!Args.hasArg(options::OPT_cuda_path_ignore_env)) { // Try to find ptxas binary. If the executable is located in a directory // called 'bin/', its parent directory might be a good guess for a valid // CUDA installation. diff --git a/clang/lib/Driver/ToolChains/Cygwin.cpp b/clang/lib/Driver/ToolChains/Cygwin.cpp index d9c16347daa34..55438125ce0f1 100644 --- a/clang/lib/Driver/ToolChains/Cygwin.cpp +++ b/clang/lib/Driver/ToolChains/Cygwin.cpp @@ -10,7 +10,7 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Support/Path.h" #include "llvm/Support/VirtualFileSystem.h" @@ -58,7 +58,7 @@ void Cygwin::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 2fb7652d64536..fc3cd9030f71d 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -14,8 +14,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" @@ -1079,7 +1079,7 @@ StringRef MachO::getMachOArchName(const ArgList &Args) const { case llvm::Triple::thumb: case llvm::Triple::arm: - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) + if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) if (const char *Arch = ArmMachOArchName(A->getValue())) return Arch; @@ -2993,7 +2993,7 @@ DerivedArgList *MachO::TranslateArgs(const DerivedArgList &Args, if (!BoundArch.empty()) { StringRef Name = BoundArch; const Option MCpu = Opts.getOption(options::OPT_mcpu_EQ); - const Option MArch = Opts.getOption(clang::driver::options::OPT_march_EQ); + const Option MArch = Opts.getOption(options::OPT_march_EQ); // This code must be kept in sync with LLVM's getArchTypeForDarwinArch, // which defines the list of which architectures we accept. diff --git a/clang/lib/Driver/ToolChains/DragonFly.cpp b/clang/lib/Driver/ToolChains/DragonFly.cpp index 524f5f2ff391e..d4a6d6ae3e349 100644 --- a/clang/lib/Driver/ToolChains/DragonFly.cpp +++ b/clang/lib/Driver/ToolChains/DragonFly.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" @@ -219,7 +219,7 @@ void DragonFly::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 88bce181d40d2..038395e4b68f2 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -11,7 +11,7 @@ #include "clang/Basic/CodeGenOptions.h" #include "clang/Driver/CommonArgs.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Frontend/Debug/Options.h" #include "llvm/Support/Path.h" #include "llvm/TargetParser/Host.h" @@ -230,7 +230,7 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_fstack_repack_arrays, options::OPT_fno_stack_repack_arrays, options::OPT_ftime_report, options::OPT_ftime_report_EQ, options::OPT_funroll_loops, options::OPT_fno_unroll_loops}); - if (Args.hasArg(clang::driver::options::OPT_fcoarray)) + if (Args.hasArg(options::OPT_fcoarray)) CmdArgs.push_back("-fcoarray"); } diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp index b17b76233ad30..70e66a2f5c3e7 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -13,8 +13,8 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" @@ -404,7 +404,7 @@ void FreeBSD::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 507cc03b27513..9edfc4de3d602 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/MultilibBuilder.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -344,7 +344,7 @@ Tool *Fuchsia::buildStaticLibTool() const { ToolChain::RuntimeLibType Fuchsia::GetRuntimeLibType(const ArgList &Args) const { - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) { + if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) { StringRef Value = A->getValue(); if (Value != "compiler-rt") getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 2dd8cc8422cd7..1bfcd1f4f3a7c 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -20,9 +20,9 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/MultilibBuilder.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" #include "llvm/Option/ArgList.h" @@ -2058,7 +2058,7 @@ Generic_GCC::GCCVersion Generic_GCC::GCCVersion::Parse(StringRef VersionText) { static llvm::StringRef getGCCToolchainDir(const ArgList &Args, llvm::StringRef SysRoot) { - const Arg *A = Args.getLastArg(clang::driver::options::OPT_gcc_toolchain); + const Arg *A = Args.getLastArg(options::OPT_gcc_toolchain); if (A) return A->getValue(); @@ -2111,8 +2111,7 @@ void Generic_GCC::GCCInstallationDetector::init( CandidateBiarchTripleAliases); // If --gcc-install-dir= is specified, skip filesystem detection. - if (const Arg *A = - Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ); + if (const Arg *A = Args.getLastArg(options::OPT_gcc_install_dir_EQ); A && A->getValue()[0]) { StringRef InstallDir = A->getValue(); if (!ScanGCCForMultilibs(TargetTriple, Args, InstallDir, false)) { @@ -2135,8 +2134,7 @@ void Generic_GCC::GCCInstallationDetector::init( // If --gcc-triple is specified use this instead of trying to // auto-detect a triple. - if (const Arg *A = - Args.getLastArg(clang::driver::options::OPT_gcc_triple_EQ)) { + if (const Arg *A = Args.getLastArg(options::OPT_gcc_triple_EQ)) { StringRef GCCTriple = A->getValue(); CandidateTripleAliases.clear(); CandidateTripleAliases.push_back(GCCTriple); diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index c0c8afec07264..0fbfa090ed9d3 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -15,8 +15,8 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/TargetParser/TargetParser.h" diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp index bce7f46dea468..be0f49d8e1497 100644 --- a/clang/lib/Driver/ToolChains/HIPSPV.cpp +++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp @@ -12,7 +12,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -211,7 +211,7 @@ HIPSPVToolChain::getDeviceLibs( // Find device libraries in --hip-device-lib-path and HIP_DEVICE_LIB_PATH. auto HipDeviceLibPathArgs = DriverArgs.getAllArgValues( // --hip-device-lib-path is alias to this option. - clang::driver::options::OPT_rocm_device_lib_path_EQ); + options::OPT_rocm_device_lib_path_EQ); for (auto Path : HipDeviceLibPathArgs) LibraryPaths.push_back(DriverArgs.MakeArgString(Path)); diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp index 732403e69a075..1af2ae6470f1e 100644 --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp @@ -9,7 +9,7 @@ #include "HIPUtility.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/Archive.h" diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp index 9f8b676fc7dc2..084f51721315c 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.cpp +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp @@ -11,7 +11,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Hurd.cpp b/clang/lib/Driver/ToolChains/Hurd.cpp index 43121233ea7d0..53ee4d4c0cbde 100644 --- a/clang/lib/Driver/ToolChains/Hurd.cpp +++ b/clang/lib/Driver/ToolChains/Hurd.cpp @@ -10,7 +10,7 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Support/Path.h" #include "llvm/Support/VirtualFileSystem.h" @@ -168,7 +168,7 @@ void Hurd::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index 94a9fe8b1a63f..020e7465548fe 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -16,8 +16,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Distro.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Path.h" @@ -731,7 +731,7 @@ void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; // Add 'include' in the resource directory, which is similar to diff --git a/clang/lib/Driver/ToolChains/MSP430.cpp b/clang/lib/Driver/ToolChains/MSP430.cpp index 9eca1ad5f2865..3cc56bb7e832e 100644 --- a/clang/lib/Driver/ToolChains/MSP430.cpp +++ b/clang/lib/Driver/ToolChains/MSP430.cpp @@ -12,7 +12,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Multilib.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index bb469ff095cd4..fcae5b7a18f34 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/ConvertUTF.h" diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp index da4a9072317f4..1bbabdfc631b8 100644 --- a/clang/lib/Driver/ToolChains/Managarm.cpp +++ b/clang/lib/Driver/ToolChains/Managarm.cpp @@ -11,8 +11,8 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" @@ -136,7 +136,7 @@ void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp index 1bb9bcfe6aab2..2c9a174069f70 100644 --- a/clang/lib/Driver/ToolChains/MinGW.cpp +++ b/clang/lib/Driver/ToolChains/MinGW.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" diff --git a/clang/lib/Driver/ToolChains/MipsLinux.cpp b/clang/lib/Driver/ToolChains/MipsLinux.cpp index 7dd3936613296..58d6b5031f536 100644 --- a/clang/lib/Driver/ToolChains/MipsLinux.cpp +++ b/clang/lib/Driver/ToolChains/MipsLinux.cpp @@ -9,7 +9,7 @@ #include "MipsLinux.h" #include "Arch/Mips.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -38,7 +38,7 @@ MipsLLVMToolChain::MipsLLVMToolChain(const Driver &D, void MipsLLVMToolChain::AddClangSystemIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; const Driver &D = getDriver(); diff --git a/clang/lib/Driver/ToolChains/NetBSD.cpp b/clang/lib/Driver/ToolChains/NetBSD.cpp index 8db00deeb80df..ea722b59853d6 100644 --- a/clang/lib/Driver/ToolChains/NetBSD.cpp +++ b/clang/lib/Driver/ToolChains/NetBSD.cpp @@ -14,8 +14,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" @@ -466,7 +466,7 @@ void NetBSD::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp index 00991504e97a8..607eb714f85dc 100644 --- a/clang/lib/Driver/ToolChains/OHOS.cpp +++ b/clang/lib/Driver/ToolChains/OHOS.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/FileSystem.h" @@ -174,7 +174,7 @@ OHOS::OHOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) ToolChain::RuntimeLibType OHOS::GetRuntimeLibType( const ArgList &Args) const { - if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) { + if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) { StringRef Value = A->getValue(); if (Value != "compiler-rt") getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name) diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index 8f589186af343..5e7b4f1a664e6 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -13,8 +13,8 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" #include "llvm/Support/VirtualFileSystem.h" @@ -315,7 +315,7 @@ void OpenBSD::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp index 8d381c4f14371..76180431ee682 100644 --- a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp @@ -8,7 +8,7 @@ #include "PPCFreeBSD.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Support/Path.h" using namespace clang::driver::toolchains; @@ -16,7 +16,7 @@ using namespace llvm::opt; void PPCFreeBSDToolChain::AddClangSystemIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) && + if (!DriverArgs.hasArg(options::OPT_nostdinc) && !DriverArgs.hasArg(options::OPT_nobuiltininc)) { const Driver &D = getDriver(); SmallString<128> P(D.ResourceDir); diff --git a/clang/lib/Driver/ToolChains/PPCLinux.cpp b/clang/lib/Driver/ToolChains/PPCLinux.cpp index 768214e416bd7..672ebd5b7b98d 100644 --- a/clang/lib/Driver/ToolChains/PPCLinux.cpp +++ b/clang/lib/Driver/ToolChains/PPCLinux.cpp @@ -8,7 +8,7 @@ #include "PPCLinux.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -58,7 +58,7 @@ PPCLinuxToolChain::PPCLinuxToolChain(const Driver &D, void PPCLinuxToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) && + if (!DriverArgs.hasArg(options::OPT_nostdinc) && !DriverArgs.hasArg(options::OPT_nobuiltininc)) { const Driver &D = getDriver(); SmallString<128> P(D.ResourceDir); diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 34ec65ae59602..6fe18aa4cceba 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -11,8 +11,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/SPIRV.cpp b/clang/lib/Driver/ToolChains/SPIRV.cpp index ea824dbad54cb..27de55cfebfc1 100644 --- a/clang/lib/Driver/ToolChains/SPIRV.cpp +++ b/clang/lib/Driver/ToolChains/SPIRV.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" using namespace clang::driver; using namespace clang::driver::toolchains; diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 0232b047a6c4b..85859f344b491 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -20,7 +20,7 @@ SYCLInstallationDetector::SYCLInstallationDetector( void SYCLInstallationDetector::addSYCLIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nobuiltininc)) + if (DriverArgs.hasArg(options::OPT_nobuiltininc)) return; // Add the SYCL header search locations in the specified order. diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index 64c7d1ceb3a36..ad0f41144f393 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -13,9 +13,9 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/ToolChain.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" @@ -360,7 +360,7 @@ void Solaris::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/UEFI.cpp b/clang/lib/Driver/ToolChains/UEFI.cpp index d2be147c7b9f6..7732e37f8061d 100644 --- a/clang/lib/Driver/ToolChains/UEFI.cpp +++ b/clang/lib/Driver/ToolChains/UEFI.cpp @@ -11,8 +11,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" +#include "clang/Options/Options.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" diff --git a/clang/lib/Driver/ToolChains/VEToolchain.cpp b/clang/lib/Driver/ToolChains/VEToolchain.cpp index ad9129046c3e1..78509bcdae0fe 100644 --- a/clang/lib/Driver/ToolChains/VEToolchain.cpp +++ b/clang/lib/Driver/ToolChains/VEToolchain.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" #include // ::getenv @@ -78,7 +78,7 @@ bool VEToolChain::hasBlocksRuntime() const { return false; } void VEToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; if (DriverArgs.hasArg(options::OPT_nobuiltininc) && @@ -117,7 +117,7 @@ void VEToolChain::addClangTargetOptions(const ArgList &DriverArgs, void VEToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) || + if (DriverArgs.hasArg(options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp index 5054868b5ff4d..15c6f19e87fee 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -12,7 +12,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Config/llvm-config.h" // for LLVM_VERSION_STRING #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" @@ -297,7 +297,7 @@ bool WebAssembly::HasNativeLLVMSupport() const { return true; } void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, ArgStringList &CC1Args, Action::OffloadKind) const { - if (!DriverArgs.hasFlag(clang::driver::options::OPT_fuse_init_array, + if (!DriverArgs.hasFlag(options::OPT_fuse_init_array, options::OPT_fno_use_init_array, true)) CC1Args.push_back("-fno-use-init-array"); @@ -472,7 +472,7 @@ WebAssembly::GetCXXStdlibType(const ArgList &Args) const { void WebAssembly::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) + if (DriverArgs.hasArg(options::OPT_nostdinc)) return; const Driver &D = getDriver(); diff --git a/clang/lib/Driver/ToolChains/XCore.cpp b/clang/lib/Driver/ToolChains/XCore.cpp index 6a2a75cb99739..dd26c11affffb 100644 --- a/clang/lib/Driver/ToolChains/XCore.cpp +++ b/clang/lib/Driver/ToolChains/XCore.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include // ::getenv @@ -113,7 +113,7 @@ bool XCoreToolChain::hasBlocksRuntime() const { return false; } void XCoreToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) || + if (DriverArgs.hasArg(options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc)) return; if (const char *cl_include_dir = getenv("XCC_C_INCLUDE_PATH")) { @@ -137,7 +137,7 @@ void XCoreToolChain::addClangTargetOptions(const ArgList &DriverArgs, void XCoreToolChain::AddClangCXXStdlibIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) || + if (DriverArgs.hasArg(options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp index 9a3c45323a3cf..eac8f623f9a50 100644 --- a/clang/lib/Driver/ToolChains/ZOS.cpp +++ b/clang/lib/Driver/ToolChains/ZOS.cpp @@ -9,7 +9,7 @@ #include "ZOS.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/WithColor.h" diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index 0325296f84b19..4c2d11751a363 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -8,8 +8,8 @@ #include "clang/Driver/XRayArgs.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" +#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/SpecialCaseList.h" diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt index a916667208845..dac9e0d26f393 100644 --- a/clang/lib/Frontend/CMakeLists.txt +++ b/clang/lib/Frontend/CMakeLists.txt @@ -52,6 +52,7 @@ add_clang_library(clangFrontend clangAST clangBasic clangDriver + clangOptions clangEdit clangLex clangParse diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index be7c1d367e082..f584a2a5824b2 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -27,7 +27,6 @@ #include "clang/Basic/XRayInstr.h" #include "clang/Config/config.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CommandLineSourceLoc.h" #include "clang/Frontend/DependencyOutputOptions.h" #include "clang/Frontend/FrontendDiagnostic.h" @@ -38,6 +37,7 @@ #include "clang/Frontend/Utils.h" #include "clang/Lex/HeaderSearchOptions.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Options/Options.h" #include "clang/Serialization/ASTBitCodes.h" #include "clang/Serialization/ModuleFileExtension.h" #include "clang/StaticAnalyzer/Core/AnalyzerOptions.h" @@ -255,7 +255,7 @@ CowCompilerInvocation::getMutPreprocessorOutputOpts() { using ArgumentConsumer = CompilerInvocation::ArgumentConsumer; #define OPTTABLE_STR_TABLE_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_STR_TABLE_CODE static llvm::StringRef lookupStrInTable(unsigned Offset) { @@ -263,7 +263,7 @@ static llvm::StringRef lookupStrInTable(unsigned Offset) { } #define SIMPLE_ENUM_VALUE_TABLE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef SIMPLE_ENUM_VALUE_TABLE static std::optional normalizeSimpleFlag(OptSpecifier Opt, @@ -981,7 +981,7 @@ static void GenerateAnalyzerArgs(const AnalyzerOptions &Opts, #define ANALYZER_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef ANALYZER_OPTION_WITH_MARSHALLING if (Opts.AnalysisConstraintsOpt != RangeConstraintsModel) { @@ -1068,7 +1068,7 @@ static bool ParseAnalyzerArgs(AnalyzerOptions &Opts, ArgList &Args, #define ANALYZER_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef ANALYZER_OPTION_WITH_MARSHALLING if (Arg *A = Args.getLastArg(OPT_analyzer_constraints)) { @@ -1575,7 +1575,7 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, #define CODEGEN_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef CODEGEN_OPTION_WITH_MARSHALLING if (Opts.OptimizationLevel > 0) { @@ -1880,7 +1880,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, #define CODEGEN_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef CODEGEN_OPTION_WITH_MARSHALLING // At O0 we want to fully disable inlining outside of cases marked with @@ -2371,7 +2371,7 @@ static void GenerateDependencyOutputArgs(const DependencyOutputOptions &Opts, const DependencyOutputOptions &DependencyOutputOpts = Opts; #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING if (Opts.ShowIncludesDest != ShowIncludesDestination::None) @@ -2406,7 +2406,7 @@ static bool ParseDependencyOutputArgs(DependencyOutputOptions &Opts, DependencyOutputOptions &DependencyOutputOpts = Opts; #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING if (Args.hasArg(OPT_show_includes)) { @@ -2534,7 +2534,7 @@ static void GenerateFileSystemArgs(const FileSystemOptions &Opts, #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING } @@ -2546,7 +2546,7 @@ static bool ParseFileSystemArgs(FileSystemOptions &Opts, const ArgList &Args, #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING return Diags.getNumErrors() == NumErrorsBefore; @@ -2557,7 +2557,7 @@ static void GenerateMigratorArgs(const MigratorOptions &Opts, const MigratorOptions &MigratorOpts = Opts; #define MIGRATOR_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef MIGRATOR_OPTION_WITH_MARSHALLING } @@ -2569,7 +2569,7 @@ static bool ParseMigratorArgs(MigratorOptions &Opts, const ArgList &Args, #define MIGRATOR_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef MIGRATOR_OPTION_WITH_MARSHALLING return Diags.getNumErrors() == NumErrorsBefore; @@ -2581,7 +2581,7 @@ void CompilerInvocationBase::GenerateDiagnosticArgs( const DiagnosticOptions *DiagnosticOpts = &Opts; #define DIAG_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef DIAG_OPTION_WITH_MARSHALLING if (!Opts.DiagnosticSerializationFile.empty()) @@ -2686,7 +2686,7 @@ bool clang::ParseDiagnosticArgs(DiagnosticOptions &Opts, ArgList &Args, #define DIAG_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, *Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef DIAG_OPTION_WITH_MARSHALLING llvm::sys::Process::UseANSIEscapeCodes(Opts.UseANSIEscapeCodes); @@ -2836,7 +2836,7 @@ static void GenerateFrontendArgs(const FrontendOptions &Opts, const FrontendOptions &FrontendOpts = Opts; #define FRONTEND_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef FRONTEND_OPTION_WITH_MARSHALLING std::optional ProgramActionOpt = @@ -3006,7 +3006,7 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args, #define FRONTEND_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef FRONTEND_OPTION_WITH_MARSHALLING Opts.ProgramAction = frontend::ParseSyntaxOnly; @@ -3288,7 +3288,7 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts, const HeaderSearchOptions *HeaderSearchOpts = &Opts; #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING if (Opts.UseLibcxx) @@ -3403,7 +3403,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args, #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING if (const Arg *A = Args.getLastArg(OPT_stdlib_EQ)) @@ -3736,7 +3736,7 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, #define LANG_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // The '-fcf-protection=' option is generated by CodeGenOpts generator. @@ -4082,7 +4082,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, #define LANG_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) { @@ -4745,7 +4745,7 @@ static void GeneratePreprocessorArgs(const PreprocessorOptions &Opts, #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef PREPROCESSOR_OPTION_WITH_MARSHALLING if (Opts.PCHWithHdrStop && !Opts.PCHWithHdrStopCreate) @@ -4819,7 +4819,7 @@ static bool ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args, #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef PREPROCESSOR_OPTION_WITH_MARSHALLING Opts.PCHWithHdrStop = Args.hasArg(OPT_pch_through_hdrstop_create) || @@ -4912,7 +4912,7 @@ GeneratePreprocessorOutputArgs(const PreprocessorOutputOptions &Opts, #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING bool Generate_dM = isStrictlyPreprocessorAction(Action) && !Opts.ShowCPP; @@ -4933,7 +4933,7 @@ static bool ParsePreprocessorOutputArgs(PreprocessorOutputOptions &Opts, #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING Opts.ShowCPP = isStrictlyPreprocessorAction(Action) && !Args.hasArg(OPT_dM); @@ -4948,7 +4948,7 @@ static void GenerateTargetArgs(const TargetOptions &Opts, const TargetOptions *TargetOpts = &Opts; #define TARGET_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef TARGET_OPTION_WITH_MARSHALLING if (!Opts.SDKVersion.empty()) @@ -4967,7 +4967,7 @@ static bool ParseTargetArgs(TargetOptions &Opts, ArgList &Args, #define TARGET_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef TARGET_OPTION_WITH_MARSHALLING if (Arg *A = Args.getLastArg(options::OPT_target_sdk_version_EQ)) { diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp index 99212b81fe064..73b81ed906808 100644 --- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp +++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -14,11 +14,11 @@ #include "clang/Driver/Action.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/Utils.h" +#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Option/ArgList.h" @@ -61,11 +61,11 @@ clang::createInvocation(ArrayRef ArgList, if (!C) return nullptr; - if (C->getArgs().hasArg(driver::options::OPT_fdriver_only)) + if (C->getArgs().hasArg(options::OPT_fdriver_only)) return nullptr; // Just print the cc1 options if -### was present. - if (C->getArgs().hasArg(driver::options::OPT__HASH_HASH_HASH)) { + if (C->getArgs().hasArg(options::OPT__HASH_HASH_HASH)) { C->getJobs().Print(llvm::errs(), "\n", true); return nullptr; } diff --git a/clang/lib/FrontendTool/CMakeLists.txt b/clang/lib/FrontendTool/CMakeLists.txt index 061e54c3e62d0..66213f76eb968 100644 --- a/clang/lib/FrontendTool/CMakeLists.txt +++ b/clang/lib/FrontendTool/CMakeLists.txt @@ -7,6 +7,7 @@ set(link_libs clangBasic clangCodeGen clangDriver + clangOptions clangExtractAPI clangFrontend clangRewriteFrontend diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index c8aad4daa1c10..e571193c6a9c8 100644 --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -13,7 +13,6 @@ #include "clang/CodeGen/CodeGenAction.h" #include "clang/Config/config.h" -#include "clang/Driver/Options.h" #include "clang/ExtractAPI/FrontendActions.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/CompilerInvocation.h" @@ -22,6 +21,7 @@ #include "clang/Frontend/FrontendPluginRegistry.h" #include "clang/Frontend/Utils.h" #include "clang/FrontendTool/Utils.h" +#include "clang/Options/Options.h" #include "clang/Rewrite/Frontend/FrontendActions.h" #include "clang/StaticAnalyzer/Frontend/AnalyzerHelpFlags.h" #include "clang/StaticAnalyzer/Frontend/FrontendActions.h" @@ -215,11 +215,11 @@ bool ExecuteCompilerInvocation(CompilerInstance *Clang) { // Honor -help. if (Clang->getFrontendOpts().ShowHelp) { - driver::getDriverOptTable().printHelp( + getDriverOptTable().printHelp( llvm::outs(), "clang -cc1 [options] file...", "LLVM 'Clang' Compiler: http://clang.llvm.org", /*ShowHidden=*/false, /*ShowAllAliases=*/false, - llvm::opt::Visibility(driver::options::CC1Option)); + llvm::opt::Visibility(options::CC1Option)); return true; } diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index 76338065d2231..7764fa7dc92b9 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -33,7 +33,6 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" @@ -43,6 +42,7 @@ #include "clang/Interpreter/Interpreter.h" #include "clang/Interpreter/Value.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Options/Options.h" #include "clang/Sema/Lookup.h" #include "clang/Serialization/ObjectFilePCHContainerReader.h" #include "llvm/ExecutionEngine/JITSymbol.h" @@ -185,7 +185,7 @@ IncrementalCompilerBuilder::create(std::string TT, llvm::ArrayRef RF = llvm::ArrayRef(ClangArgv); std::unique_ptr Compilation(Driver.BuildCompilation(RF)); - if (Compilation->getArgs().hasArg(driver::options::OPT_v)) + if (Compilation->getArgs().hasArg(options::OPT_v)) Compilation->getJobs().Print(llvm::errs(), "\n", /*Quote=*/false); auto ErrOrCC1Args = GetCC1Arguments(&Diags, Compilation.get()); diff --git a/clang/lib/Interpreter/InterpreterUtils.h b/clang/lib/Interpreter/InterpreterUtils.h index fbf9814b0d4a7..4efe8b9fbc6cc 100644 --- a/clang/lib/Interpreter/InterpreterUtils.h +++ b/clang/lib/Interpreter/InterpreterUtils.h @@ -21,11 +21,11 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/TextDiagnosticBuffer.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Options/Options.h" #include "clang/Sema/Lookup.h" #include "llvm/IR/Module.h" diff --git a/clang/lib/Options/CMakeLists.txt b/clang/lib/Options/CMakeLists.txt new file mode 100644 index 0000000000000..a762e9918b41c --- /dev/null +++ b/clang/lib/Options/CMakeLists.txt @@ -0,0 +1,18 @@ +set(LLVM_LINK_COMPONENTS + Option + Support +) + +add_clang_library(clangOptions + DriverOptions.cpp + OptionUtils.cpp + + DEPENDS + ClangDriverOptions + # These generated headers are included transitively. + target_parser_gen + + LINK_LIBS + clangBasic + ${system_libs} +) diff --git a/clang/lib/Driver/DriverOptions.cpp b/clang/lib/Options/DriverOptions.cpp similarity index 76% rename from clang/lib/Driver/DriverOptions.cpp rename to clang/lib/Options/DriverOptions.cpp index cde1f8989935b..d91e9291fb2f6 100644 --- a/clang/lib/Driver/DriverOptions.cpp +++ b/clang/lib/Options/DriverOptions.cpp @@ -6,33 +6,32 @@ // //===----------------------------------------------------------------------===// -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/OptTable.h" #include -using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace llvm::opt; #define OPTTABLE_STR_TABLE_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_STR_TABLE_CODE #define OPTTABLE_VALUES_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_VALUES_CODE #define OPTTABLE_PREFIXES_TABLE_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_PREFIXES_TABLE_CODE #define OPTTABLE_PREFIXES_UNION_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_PREFIXES_UNION_CODE static constexpr OptTable::Info InfoTable[] = { #define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__), -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTION }; @@ -44,9 +43,9 @@ class DriverOptTable : public PrecomputedOptTable { : PrecomputedOptTable(OptionStrTable, OptionPrefixesTable, InfoTable, OptionPrefixesUnion) {} }; -} +} // anonymous namespace -const llvm::opt::OptTable &clang::driver::getDriverOptTable() { +const llvm::opt::OptTable &clang::getDriverOptTable() { static DriverOptTable Table; return Table; } diff --git a/clang/lib/Driver/OptionUtils.cpp b/clang/lib/Options/OptionUtils.cpp similarity index 97% rename from clang/lib/Driver/OptionUtils.cpp rename to clang/lib/Options/OptionUtils.cpp index 1f36ffc03cab3..fcafd3c83c6b3 100644 --- a/clang/lib/Driver/OptionUtils.cpp +++ b/clang/lib/Options/OptionUtils.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// +#include "clang/Options/OptionUtils.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticDriver.h" -#include "clang/Driver/OptionUtils.h" #include "llvm/Option/ArgList.h" using namespace clang; diff --git a/clang/lib/Tooling/CMakeLists.txt b/clang/lib/Tooling/CMakeLists.txt index fc1f1f9f9d367..faaa53276d0e6 100644 --- a/clang/lib/Tooling/CMakeLists.txt +++ b/clang/lib/Tooling/CMakeLists.txt @@ -40,6 +40,7 @@ add_clang_library(clangTooling clangASTMatchers clangBasic clangDriver + clangOptions clangFormat clangFrontend clangLex diff --git a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp index 28568426a6c48..e9b72388ae4df 100644 --- a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp +++ b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp @@ -44,8 +44,8 @@ #include "clang/Basic/LangStandard.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Types.h" +#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -164,11 +164,11 @@ struct TransferableCommand { // We parse each argument individually so that we can retain the exact // spelling of each argument; re-rendering is lossy for aliased flags. // E.g. in CL mode, /W4 maps to -Wall. - auto &OptTable = clang::driver::getDriverOptTable(); + auto &OptTable = getDriverOptTable(); if (!OldArgs.empty()) Cmd.CommandLine.emplace_back(OldArgs.front()); for (unsigned Pos = 1; Pos < OldArgs.size();) { - using namespace driver::options; + using namespace options; const unsigned OldPos = Pos; std::unique_ptr Arg(OptTable.ParseOneArg( @@ -296,14 +296,14 @@ struct TransferableCommand { // Try to interpret the argument as a type specifier, e.g. '-x'. std::optional tryParseTypeArg(const llvm::opt::Arg &Arg) { const llvm::opt::Option &Opt = Arg.getOption(); - using namespace driver::options; + using namespace options; if (ClangCLMode) { if (Opt.matches(OPT__SLASH_TC) || Opt.matches(OPT__SLASH_Tc)) return types::TY_C; if (Opt.matches(OPT__SLASH_TP) || Opt.matches(OPT__SLASH_Tp)) return types::TY_CXX; } else { - if (Opt.matches(driver::options::OPT_x)) + if (Opt.matches(options::OPT_x)) return types::lookupTypeForTypeSpecifier(Arg.getValue()); } return std::nullopt; @@ -311,7 +311,7 @@ struct TransferableCommand { // Try to interpret the argument as '-std='. std::optional tryParseStdArg(const llvm::opt::Arg &Arg) { - using namespace driver::options; + using namespace options; if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ)) { // "c++latest" is not a recognized LangStandard, but it's accepted by // the clang driver in CL mode. diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index e8eef5ed9c9fa..1f6a5c94601fc 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -21,7 +21,6 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" -#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" #include "clang/Frontend/ASTUnit.h" @@ -32,6 +31,7 @@ #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Lex/HeaderSearchOptions.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Options/Options.h" #include "clang/Tooling/ArgumentsAdjusters.h" #include "clang/Tooling/CompilationDatabase.h" #include "llvm/ADT/ArrayRef.h" @@ -270,17 +270,15 @@ void addTargetAndModeForProgramName(std::vector &CommandLine, StringRef InvokedAs) { if (CommandLine.empty() || InvokedAs.empty()) return; - const auto &Table = driver::getDriverOptTable(); + const auto &Table = getDriverOptTable(); // --target=X - StringRef TargetOPT = - Table.getOption(driver::options::OPT_target).getPrefixedName(); + StringRef TargetOPT = Table.getOption(options::OPT_target).getPrefixedName(); // -target X StringRef TargetOPTLegacy = - Table.getOption(driver::options::OPT_target_legacy_spelling) - .getPrefixedName(); + Table.getOption(options::OPT_target_legacy_spelling).getPrefixedName(); // --driver-mode=X StringRef DriverModeOPT = - Table.getOption(driver::options::OPT_driver_mode).getPrefixedName(); + Table.getOption(options::OPT_driver_mode).getPrefixedName(); auto TargetMode = driver::ToolChain::getTargetAndModeFromProgramName(InvokedAs); // No need to search for target args if we don't have a target/mode to insert. diff --git a/clang/tools/clang-check/CMakeLists.txt b/clang/tools/clang-check/CMakeLists.txt index 5493aa4237aee..1efcc91fcaec9 100644 --- a/clang/tools/clang-check/CMakeLists.txt +++ b/clang/tools/clang-check/CMakeLists.txt @@ -14,6 +14,7 @@ clang_target_link_libraries(clang-check clangBasic clangDriver clangFrontend + clangOptions clangRewriteFrontend clangSerialization clangStaticAnalyzerFrontend diff --git a/clang/tools/clang-check/ClangCheck.cpp b/clang/tools/clang-check/ClangCheck.cpp index fa6dd06a1ee58..80255c647b98f 100644 --- a/clang/tools/clang-check/ClangCheck.cpp +++ b/clang/tools/clang-check/ClangCheck.cpp @@ -16,9 +16,9 @@ //===----------------------------------------------------------------------===// #include "clang/AST/ASTConsumer.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/ASTConsumers.h" #include "clang/Frontend/CompilerInstance.h" +#include "clang/Options/Options.h" #include "clang/Rewrite/Frontend/FixItRewriter.h" #include "clang/Rewrite/Frontend/FrontendActions.h" #include "clang/StaticAnalyzer/Frontend/FrontendActions.h" @@ -34,8 +34,8 @@ #include "llvm/Support/Signals.h" #include "llvm/Support/TargetSelect.h" -using namespace clang::driver; using namespace clang::tooling; +using namespace clang; using namespace llvm; static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage); diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt index 9c0d9dff7dc7f..54bc80486472f 100644 --- a/clang/tools/clang-installapi/CMakeLists.txt +++ b/clang/tools/clang-installapi/CMakeLists.txt @@ -25,6 +25,7 @@ clang_target_link_libraries(clang-installapi clangAST clangInstallAPI clangBasic + clangOptions clangDriver clangFrontend clangTooling diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp index 4e66485343b89..8bef9690ad855 100644 --- a/clang/tools/clang-installapi/ClangInstallAPI.cpp +++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp @@ -35,7 +35,7 @@ using namespace clang; using namespace clang::installapi; -using namespace clang::driver::options; +using namespace clang::options; using namespace llvm::opt; using namespace llvm::MachO; @@ -71,7 +71,7 @@ static bool runFrontend(StringRef ProgName, Twine Label, bool Verbose, static bool run(ArrayRef Args, const char *ProgName) { // Setup Diagnostics engine. DiagnosticOptions DiagOpts; - const llvm::opt::OptTable &ClangOpts = clang::driver::getDriverOptTable(); + const llvm::opt::OptTable &ClangOpts = getDriverOptTable(); unsigned MissingArgIndex, MissingArgCount; llvm::opt::InputArgList ParsedArgs = ClangOpts.ParseArgs( ArrayRef(Args).slice(1), MissingArgIndex, MissingArgCount); diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp index 64324a3f8b010..f484d6f33ad8f 100644 --- a/clang/tools/clang-installapi/Options.cpp +++ b/clang/tools/clang-installapi/Options.cpp @@ -26,8 +26,6 @@ using namespace llvm; using namespace llvm::opt; using namespace llvm::MachO; -namespace drv = clang::driver::options; - namespace clang { namespace installapi { @@ -109,7 +107,7 @@ getArgListFromJSON(const StringRef Input, llvm::opt::OptTable *Table, bool Options::processDriverOptions(InputArgList &Args) { // Handle inputs. - for (const StringRef Path : Args.getAllArgValues(drv::OPT_INPUT)) { + for (const StringRef Path : Args.getAllArgValues(options::OPT_INPUT)) { // Assume any input that is not a directory is a filelist. // InstallAPI does not accept multiple directories, so retain the last one. if (FM->getOptionalDirectoryRef(Path)) @@ -120,7 +118,7 @@ bool Options::processDriverOptions(InputArgList &Args) { // Handle output. SmallString OutputPath; - if (auto *Arg = Args.getLastArg(drv::OPT_o)) { + if (auto *Arg = Args.getLastArg(options::OPT_o)) { OutputPath = Arg->getValue(); if (OutputPath != "-") FM->makeAbsolutePath(OutputPath); @@ -132,10 +130,10 @@ bool Options::processDriverOptions(InputArgList &Args) { } // Do basic error checking first for mixing -target and -arch options. - auto *ArgArch = Args.getLastArgNoClaim(drv::OPT_arch); - auto *ArgTarget = Args.getLastArgNoClaim(drv::OPT_target); + auto *ArgArch = Args.getLastArgNoClaim(options::OPT_arch); + auto *ArgTarget = Args.getLastArgNoClaim(options::OPT_target); auto *ArgTargetVariant = - Args.getLastArgNoClaim(drv::OPT_darwin_target_variant); + Args.getLastArgNoClaim(options::OPT_darwin_target_variant); if (ArgArch && (ArgTarget || ArgTargetVariant)) { Diags->Report(clang::diag::err_drv_argument_not_allowed_with) << ArgArch->getAsString(Args) @@ -143,7 +141,7 @@ bool Options::processDriverOptions(InputArgList &Args) { return false; } - auto *ArgMinTargetOS = Args.getLastArgNoClaim(drv::OPT_mtargetos_EQ); + auto *ArgMinTargetOS = Args.getLastArgNoClaim(options::OPT_mtargetos_EQ); if ((ArgTarget || ArgTargetVariant) && ArgMinTargetOS) { Diags->Report(clang::diag::err_drv_cannot_mix_options) << ArgTarget->getAsString(Args) << ArgMinTargetOS->getAsString(Args); @@ -152,7 +150,7 @@ bool Options::processDriverOptions(InputArgList &Args) { // Capture target triples first. if (ArgTarget) { - for (const Arg *A : Args.filtered(drv::OPT_target)) { + for (const Arg *A : Args.filtered(options::OPT_target)) { A->claim(); llvm::Triple TargetTriple(A->getValue()); Target TAPITarget = Target(TargetTriple); @@ -168,7 +166,7 @@ bool Options::processDriverOptions(InputArgList &Args) { // Capture target variants. DriverOpts.Zippered = ArgTargetVariant != nullptr; - for (Arg *A : Args.filtered(drv::OPT_darwin_target_variant)) { + for (Arg *A : Args.filtered(options::OPT_darwin_target_variant)) { A->claim(); Triple Variant(A->getValue()); if (Variant.getVendor() != Triple::Apple) { @@ -213,7 +211,7 @@ bool Options::processDriverOptions(InputArgList &Args) { DriverOpts.Targets[TAPIVariant] = Variant; } - DriverOpts.Verbose = Args.hasArgNoClaim(drv::OPT_v); + DriverOpts.Verbose = Args.hasArgNoClaim(options::OPT_v); return true; } @@ -407,7 +405,7 @@ bool Options::processOptionList(InputArgList &Args, bool Options::processLinkerOptions(InputArgList &Args) { // Handle required arguments. - if (const Arg *A = Args.getLastArg(drv::OPT_install__name)) + if (const Arg *A = Args.getLastArg(options::OPT_install__name)) LinkerOpts.InstallName = A->getValue(); if (LinkerOpts.InstallName.empty()) { Diags->Report(diag::err_no_install_name); @@ -415,28 +413,29 @@ bool Options::processLinkerOptions(InputArgList &Args) { } // Defaulted or optional arguments. - if (auto *Arg = Args.getLastArg(drv::OPT_current__version)) + if (auto *Arg = Args.getLastArg(options::OPT_current__version)) LinkerOpts.CurrentVersion.parse64(Arg->getValue()); - if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version)) + if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version)) LinkerOpts.CompatVersion.parse64(Arg->getValue()); - if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version)) + if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version)) LinkerOpts.CompatVersion.parse64(Arg->getValue()); - if (auto *Arg = Args.getLastArg(drv::OPT_umbrella)) + if (auto *Arg = Args.getLastArg(options::OPT_umbrella)) LinkerOpts.ParentUmbrella = Arg->getValue(); - LinkerOpts.IsDylib = Args.hasArg(drv::OPT_dynamiclib); + LinkerOpts.IsDylib = Args.hasArg(options::OPT_dynamiclib); - for (auto *Arg : Args.filtered(drv::OPT_alias_list)) { + for (auto *Arg : Args.filtered(options::OPT_alias_list)) { LinkerOpts.AliasLists.emplace_back(Arg->getValue()); Arg->claim(); } - LinkerOpts.AppExtensionSafe = Args.hasFlag( - drv::OPT_fapplication_extension, drv::OPT_fno_application_extension, - /*Default=*/LinkerOpts.AppExtensionSafe); + LinkerOpts.AppExtensionSafe = + Args.hasFlag(options::OPT_fapplication_extension, + options::OPT_fno_application_extension, + /*Default=*/LinkerOpts.AppExtensionSafe); if (::getenv("LD_NO_ENCRYPT") != nullptr) LinkerOpts.AppExtensionSafe = true; @@ -446,7 +445,7 @@ bool Options::processLinkerOptions(InputArgList &Args) { // Capture library paths. PathSeq LibraryPaths; - for (const Arg *A : Args.filtered(drv::OPT_L)) { + for (const Arg *A : Args.filtered(options::OPT_L)) { LibraryPaths.emplace_back(A->getValue()); A->claim(); } @@ -461,7 +460,7 @@ bool Options::processLinkerOptions(InputArgList &Args) { // invocations. bool Options::processFrontendOptions(InputArgList &Args) { // Capture language mode. - if (auto *A = Args.getLastArgNoClaim(drv::OPT_x)) { + if (auto *A = Args.getLastArgNoClaim(options::OPT_x)) { FEOpts.LangMode = llvm::StringSwitch(A->getValue()) .Case("c", clang::Language::C) .Case("c++", clang::Language::CXX) @@ -475,15 +474,15 @@ bool Options::processFrontendOptions(InputArgList &Args) { return false; } } - for (auto *A : Args.filtered(drv::OPT_ObjC, drv::OPT_ObjCXX)) { - if (A->getOption().matches(drv::OPT_ObjC)) + for (auto *A : Args.filtered(options::OPT_ObjC, options::OPT_ObjCXX)) { + if (A->getOption().matches(options::OPT_ObjC)) FEOpts.LangMode = clang::Language::ObjC; else FEOpts.LangMode = clang::Language::ObjCXX; } // Capture Sysroot. - if (const Arg *A = Args.getLastArgNoClaim(drv::OPT_isysroot)) { + if (const Arg *A = Args.getLastArgNoClaim(options::OPT_isysroot)) { SmallString Path(A->getValue()); FM->makeAbsolutePath(Path); if (!FM->getOptionalDirectoryRef(Path)) { @@ -502,13 +501,13 @@ bool Options::processFrontendOptions(InputArgList &Args) { } // Capture system frameworks for all platforms. - for (const Arg *A : Args.filtered(drv::OPT_iframework)) + for (const Arg *A : Args.filtered(options::OPT_iframework)) FEOpts.SystemFwkPaths.emplace_back(A->getValue(), std::optional{}); // Capture framework paths. PathSeq FrameworkPaths; - for (const Arg *A : Args.filtered(drv::OPT_F)) + for (const Arg *A : Args.filtered(options::OPT_F)) FrameworkPaths.emplace_back(A->getValue()); if (!FrameworkPaths.empty()) diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt index d9d36f7a41359..002aaef005253 100644 --- a/clang/tools/driver/CMakeLists.txt +++ b/clang/tools/driver/CMakeLists.txt @@ -63,6 +63,7 @@ clang_target_link_libraries(clang clangDriver clangFrontend clangFrontendTool + clangOptions clangSerialization ) diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp index 52cffa4ccbe1f..2aef75597fc5f 100644 --- a/clang/tools/driver/cc1_main.cpp +++ b/clang/tools/driver/cc1_main.cpp @@ -17,7 +17,6 @@ #include "clang/CodeGen/ObjectFilePCHContainerWriter.h" #include "clang/Config/config.h" #include "clang/Driver/DriverDiagnostic.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/CompilerInvocation.h" #include "clang/Frontend/FrontendDiagnostic.h" @@ -25,6 +24,7 @@ #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Frontend/Utils.h" #include "clang/FrontendTool/Utils.h" +#include "clang/Options/Options.h" #include "clang/Serialization/ObjectFilePCHContainerReader.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp index 50da2f8449a22..f13812f2a8383 100644 --- a/clang/tools/driver/cc1as_main.cpp +++ b/clang/tools/driver/cc1as_main.cpp @@ -14,10 +14,10 @@ #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticOptions.h" #include "clang/Driver/DriverDiagnostic.h" -#include "clang/Driver/Options.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Frontend/Utils.h" +#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" @@ -59,8 +59,7 @@ #include #include using namespace clang; -using namespace clang::driver; -using namespace clang::driver::options; +using namespace clang::options; using namespace llvm; using namespace llvm::opt; @@ -688,8 +687,7 @@ int cc1as_main(ArrayRef Argv, const char *Argv0, void *MainAddr) { getDriverOptTable().printHelp( llvm::outs(), "clang -cc1as [options] file...", "Clang Integrated Assembler", /*ShowHidden=*/false, - /*ShowAllAliases=*/false, - llvm::opt::Visibility(driver::options::CC1AsOption)); + /*ShowAllAliases=*/false, llvm::opt::Visibility(options::CC1AsOption)); return 0; } diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp index 7390d7d610ec0..1e2c9884ba63d 100644 --- a/clang/tools/driver/driver.cpp +++ b/clang/tools/driver/driver.cpp @@ -18,13 +18,13 @@ #include "clang/Config/config.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/DriverDiagnostic.h" -#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" #include "clang/Frontend/ChainedDiagnosticConsumer.h" #include "clang/Frontend/CompilerInvocation.h" #include "clang/Frontend/SerializedDiagnosticPrinter.h" #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Frontend/Utils.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp index 62274235c53f5..e0454f190b35a 100644 --- a/clang/unittests/Driver/DXCModeTest.cpp +++ b/clang/unittests/Driver/DXCModeTest.cpp @@ -131,8 +131,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) { TC.TranslateArgs(*DAL, "0", Action::OffloadKind::OFK_None)}; EXPECT_NE(TranslatedArgs, nullptr); if (TranslatedArgs) { - auto *A = TranslatedArgs->getLastArg( - clang::driver::options::OPT_dxil_validator_version); + auto *A = + TranslatedArgs->getLastArg(clang::options::OPT_dxil_validator_version); EXPECT_NE(A, nullptr); if (A) { EXPECT_STREQ(A->getValue(), "1.1"); diff --git a/clang/www/OpenProjects.html b/clang/www/OpenProjects.html index ae0ec1d4d12cb..3e5e84b5b2ed4 100755 --- a/clang/www/OpenProjects.html +++ b/clang/www/OpenProjects.html @@ -38,7 +38,7 @@

Open Clang Projects

  • documenting diagnostic group flags (adding code examples of what is diagnosed, or other relevant information), or
  • -
  • documenting +
  • documenting command line options, or
  • help with completing other missing documentation.
  • diff --git a/flang/docs/CMakeLists.txt b/flang/docs/CMakeLists.txt index 568f942cb4aa6..b183d6add1059 100644 --- a/flang/docs/CMakeLists.txt +++ b/flang/docs/CMakeLists.txt @@ -88,7 +88,7 @@ function (gen_rst_file_from_td output_file td_option source target) endif() get_filename_component(TABLEGEN_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${source}" DIRECTORY) list(APPEND LLVM_TABLEGEN_FLAGS "-I${TABLEGEN_INCLUDE_DIR}") - list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Driver/") + list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Options/") clang_tablegen(Source/${output_file} ${td_option} SOURCE ${source} TARGET ${target}) endfunction() diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md index 3286171bb1499..9953f2252218b 100644 --- a/flang/docs/FlangDriver.md +++ b/flang/docs/FlangDriver.md @@ -76,7 +76,7 @@ will ignore it when used without `Xflang`. As hinted above, `flang` and `flang -fc1` are two separate tools. The fact that these tools are accessed through one binary, `flang`, is just an implementation detail. Each tool has a separate list of options, albeit defined -in the same file: `clang/include/clang/Driver/Options.td`. +in the same file: `clang/include/clang/Options/Options.td`. The separation helps us split various tasks and allows us to implement more specialised tools. In particular, `flang` is not aware of various @@ -112,7 +112,7 @@ in terms of Clang's driver library, `clangDriver`. This approach allows us to: as linkers and assemblers. One implication of this dependency on Clang is that all of Flang's compiler options are defined alongside Clang's options in -`clang/include/clang/Driver/Options.td`. For options that are common for both +`clang/include/clang/Options/Options.td`. For options that are common for both Flang and Clang, the corresponding definitions are shared. Internally, a `clangDriver` based compiler driver works by creating actions @@ -242,7 +242,7 @@ Adding a new compiler option in Flang consists of two steps: ### Option Definition All of Flang's compiler and frontend driver options are defined in -`clang/include/clang/Driver/Options.td` in Clang. When adding a new option to +`clang/include/clang/Options/Options.td` in Clang. When adding a new option to Flang, you will either: * extend the existing definition for an option that is already available in one of Clang's drivers (e.g. `clang`), but not yet available in Flang, or @@ -314,7 +314,7 @@ add, you will have to add a dedicated entry in that enum (e.g. `ParseSyntaxOnly` for `-fsyntax-only`) and a corresponding `case` in `ParseFrontendArgs` function in the `CompilerInvocation.cpp` file, e.g.: ```cpp - case clang::driver::options::OPT_fsyntax_only: + case clang::options::OPT_fsyntax_only: opts.programAction = ParseSyntaxOnly; break; ``` diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt index 2b3bc0e9c2269..bb0b4a39cec9b 100644 --- a/flang/lib/Frontend/CMakeLists.txt +++ b/flang/lib/Frontend/CMakeLists.txt @@ -76,6 +76,7 @@ add_flang_library(flangFrontend CLANG_LIBS clangBasic clangDriver + clangOptions ) target_precompile_headers(flangFrontend PRIVATE diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index f05c4cfccf7fc..a9bb78d690f46 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -26,8 +26,8 @@ #include "clang/Basic/DiagnosticOptions.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Driver/OptionUtils.h" -#include "clang/Driver/Options.h" +#include "clang/Options/OptionUtils.h" +#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -82,11 +82,11 @@ static bool parseShowColorsArgs(const llvm::opt::ArgList &args, for (auto *a : args) { const llvm::opt::Option &opt = a->getOption(); - if (opt.matches(clang::driver::options::OPT_fcolor_diagnostics)) { + if (opt.matches(clang::options::OPT_fcolor_diagnostics)) { showColors = Colors_On; - } else if (opt.matches(clang::driver::options::OPT_fno_color_diagnostics)) { + } else if (opt.matches(clang::options::OPT_fno_color_diagnostics)) { showColors = Colors_Off; - } else if (opt.matches(clang::driver::options::OPT_fdiagnostics_color_EQ)) { + } else if (opt.matches(clang::options::OPT_fdiagnostics_color_EQ)) { llvm::StringRef value(a->getValue()); if (value == "always") showColors = Colors_On; @@ -107,15 +107,13 @@ static unsigned getOptimizationLevel(llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { unsigned defaultOpt = 0; - if (llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_O_Group)) { - if (a->getOption().matches(clang::driver::options::OPT_O0)) + if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_O_Group)) { + if (a->getOption().matches(clang::options::OPT_O0)) return 0; - assert(a->getOption().matches(clang::driver::options::OPT_O)); + assert(a->getOption().matches(clang::options::OPT_O)); - return getLastArgIntValue(args, clang::driver::options::OPT_O, defaultOpt, - diags); + return getLastArgIntValue(args, clang::options::OPT_O, defaultOpt, diags); } return defaultOpt; @@ -133,7 +131,7 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, clang::DiagnosticsEngine &diags) { using DebugInfoKind = llvm::codegenoptions::DebugInfoKind; if (llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_debug_info_kind_EQ)) { + args.getLastArg(clang::options::OPT_debug_info_kind_EQ)) { std::optional val = llvm::StringSwitch>(arg->getValue()) .Case("line-tables-only", llvm::codegenoptions::DebugLineTablesOnly) @@ -158,13 +156,13 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, diags.Report(debugWarning) << arg->getValue(); } opts.DwarfVersion = - getLastArgIntValue(args, clang::driver::options::OPT_dwarf_version_EQ, + getLastArgIntValue(args, clang::options::OPT_dwarf_version_EQ, /*Default=*/0, diags); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_split_dwarf_file)) + args.getLastArg(clang::options::OPT_split_dwarf_file)) opts.SplitDwarfFile = a->getValue(); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_split_dwarf_output)) + args.getLastArg(clang::options::OPT_split_dwarf_output)) opts.SplitDwarfOutput = a->getValue(); } return true; @@ -174,7 +172,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_fdo_concurrent_to_openmp_EQ); + args.getLastArg(clang::options::OPT_fdo_concurrent_to_openmp_EQ); if (!arg) return; @@ -199,7 +197,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, static bool parseVectorLibArg(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fveclib); + llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fveclib); if (!arg) return true; @@ -237,7 +235,7 @@ parseOptimizationRemark(clang::DiagnosticsEngine &diags, CodeGenOptions::OptRemark result; for (llvm::opt::Arg *a : args) { - if (a->getOption().matches(clang::driver::options::OPT_R_Joined)) { + if (a->getOption().matches(clang::options::OPT_R_Joined)) { llvm::StringRef value = a->getValue(); if (value == remarkOptName) { @@ -274,39 +272,39 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::DiagnosticsEngine &diags) { opts.OptimizationLevel = getOptimizationLevel(args, diags); - if (args.hasFlag(clang::driver::options::OPT_fdebug_pass_manager, - clang::driver::options::OPT_fno_debug_pass_manager, false)) + if (args.hasFlag(clang::options::OPT_fdebug_pass_manager, + clang::options::OPT_fno_debug_pass_manager, false)) opts.DebugPassManager = 1; - if (args.hasFlag(clang::driver::options::OPT_fstack_arrays, - clang::driver::options::OPT_fno_stack_arrays, false)) + if (args.hasFlag(clang::options::OPT_fstack_arrays, + clang::options::OPT_fno_stack_arrays, false)) opts.StackArrays = 1; - if (args.getLastArg(clang::driver::options::OPT_floop_interchange)) + if (args.getLastArg(clang::options::OPT_floop_interchange)) opts.InterchangeLoops = 1; - if (args.getLastArg(clang::driver::options::OPT_fexperimental_loop_fusion)) + if (args.getLastArg(clang::options::OPT_fexperimental_loop_fusion)) opts.FuseLoops = 1; - if (args.getLastArg(clang::driver::options::OPT_vectorize_loops)) + if (args.getLastArg(clang::options::OPT_vectorize_loops)) opts.VectorizeLoop = 1; - if (args.getLastArg(clang::driver::options::OPT_vectorize_slp)) + if (args.getLastArg(clang::options::OPT_vectorize_slp)) opts.VectorizeSLP = 1; - if (args.hasFlag(clang::driver::options::OPT_floop_versioning, - clang::driver::options::OPT_fno_loop_versioning, false)) + if (args.hasFlag(clang::options::OPT_floop_versioning, + clang::options::OPT_fno_loop_versioning, false)) opts.LoopVersioning = 1; - opts.UnrollLoops = args.hasFlag(clang::driver::options::OPT_funroll_loops, - clang::driver::options::OPT_fno_unroll_loops, + opts.UnrollLoops = args.hasFlag(clang::options::OPT_funroll_loops, + clang::options::OPT_fno_unroll_loops, (opts.OptimizationLevel > 1)); opts.AliasAnalysis = opts.OptimizationLevel > 0; // -mframe-pointer=none/non-leaf/reserved/all option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mframe_pointer_EQ)) { + args.getLastArg(clang::options::OPT_mframe_pointer_EQ)) { std::optional val = llvm::StringSwitch>(a->getValue()) .Case("none", llvm::FramePointerKind::None) @@ -322,7 +320,7 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, opts.setFramePointer(val.value()); } - for (auto *a : args.filtered(clang::driver::options::OPT_fpass_plugin_EQ)) + for (auto *a : args.filtered(clang::options::OPT_fpass_plugin_EQ)) opts.LLVMPassPlugins.push_back(a->getValue()); opts.Reciprocals = clang::driver::tools::parseMRecipOption(diags, args); @@ -331,15 +329,14 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::driver::tools::parseMPreferVectorWidthOption(diags, args); // -fembed-offload-object option - for (auto *a : - args.filtered(clang::driver::options::OPT_fembed_offload_object_EQ)) + for (auto *a : args.filtered(clang::options::OPT_fembed_offload_object_EQ)) opts.OffloadObjects.push_back(a->getValue()); - if (args.hasArg(clang::driver::options::OPT_finstrument_functions)) + if (args.hasArg(clang::options::OPT_finstrument_functions)) opts.InstrumentFunctions = 1; - if (const llvm::opt::Arg *a = args.getLastArg( - clang::driver::options::OPT_mcode_object_version_EQ)) { + if (const llvm::opt::Arg *a = + args.getLastArg(clang::options::OPT_mcode_object_version_EQ)) { llvm::StringRef s = a->getValue(); if (s == "6") opts.CodeObjectVersion = llvm::CodeObjectVersionKind::COV_6; @@ -353,36 +350,36 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, // -f[no-]save-optimization-record[=] if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_file)) + args.getLastArg(clang::options::OPT_opt_record_file)) opts.OptRecordFile = a->getValue(); // Optimization file format. Defaults to yaml if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_format)) + args.getLastArg(clang::options::OPT_opt_record_format)) opts.OptRecordFormat = a->getValue(); // Specifies, using a regex, which successful optimization passes(middle and // backend), to include in the final optimization record file generated. If // not provided -fsave-optimization-record will include all passes. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_opt_record_passes)) + args.getLastArg(clang::options::OPT_opt_record_passes)) opts.OptRecordPasses = a->getValue(); // Create OptRemark that allows printing of all successful optimization // passes applied. opts.OptimizationRemark = - parseOptimizationRemark(diags, args, clang::driver::options::OPT_Rpass_EQ, + parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_EQ, /*remarkOptName=*/"pass"); // Create OptRemark that allows all missed optimization passes to be printed. - opts.OptimizationRemarkMissed = parseOptimizationRemark( - diags, args, clang::driver::options::OPT_Rpass_missed_EQ, - /*remarkOptName=*/"pass-missed"); + opts.OptimizationRemarkMissed = + parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_missed_EQ, + /*remarkOptName=*/"pass-missed"); // Create OptRemark that allows all optimization decisions made by LLVM // to be printed. opts.OptimizationRemarkAnalysis = parseOptimizationRemark( - diags, args, clang::driver::options::OPT_Rpass_analysis_EQ, + diags, args, clang::options::OPT_Rpass_analysis_EQ, /*remarkOptName=*/"pass-analysis"); if (opts.getDebugInfo() == llvm::codegenoptions::NoDebugInfo) { @@ -400,23 +397,22 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, opts.setDebugInfo(llvm::codegenoptions::LocTrackingOnly); } - if (auto *a = args.getLastArg(clang::driver::options::OPT_save_temps_EQ)) + if (auto *a = args.getLastArg(clang::options::OPT_save_temps_EQ)) opts.SaveTempsDir = a->getValue(); // -record-command-line option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_record_command_line)) { + args.getLastArg(clang::options::OPT_record_command_line)) { opts.RecordCommandLine = a->getValue(); } // -mlink-builtin-bitcode - for (auto *a : - args.filtered(clang::driver::options::OPT_mlink_builtin_bitcode)) + for (auto *a : args.filtered(clang::options::OPT_mlink_builtin_bitcode)) opts.BuiltinBCLibs.push_back(a->getValue()); // -mrelocation-model option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mrelocation_model)) { + args.getLastArg(clang::options::OPT_mrelocation_model)) { llvm::StringRef modelName = a->getValue(); auto relocModel = llvm::StringSwitch>(modelName) @@ -435,31 +431,30 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, } // -pic-level and -pic-is-pie option. - if (int picLevel = getLastArgIntValue( - args, clang::driver::options::OPT_pic_level, 0, diags)) { + if (int picLevel = + getLastArgIntValue(args, clang::options::OPT_pic_level, 0, diags)) { if (picLevel > 2) diags.Report(clang::diag::err_drv_invalid_value) - << args.getLastArg(clang::driver::options::OPT_pic_level) - ->getAsString(args) + << args.getLastArg(clang::options::OPT_pic_level)->getAsString(args) << picLevel; opts.PICLevel = picLevel; - if (args.hasArg(clang::driver::options::OPT_pic_is_pie)) + if (args.hasArg(clang::options::OPT_pic_is_pie)) opts.IsPIE = 1; } - if (args.hasArg(clang::driver::options::OPT_fprofile_generate)) { + if (args.hasArg(clang::options::OPT_fprofile_generate)) { opts.setProfileInstr(llvm::driver::ProfileInstrKind::ProfileIRInstr); } - if (auto A = args.getLastArg(clang::driver::options::OPT_fprofile_use_EQ)) { + if (auto A = args.getLastArg(clang::options::OPT_fprofile_use_EQ)) { opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr); opts.ProfileInstrumentUsePath = A->getValue(); } // -mcmodel option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mcmodel_EQ)) { + args.getLastArg(clang::options::OPT_mcmodel_EQ)) { llvm::StringRef modelName = a->getValue(); std::optional codeModel = getCodeModel(modelName); @@ -470,8 +465,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, << a->getAsString(args) << modelName; } - if (const llvm::opt::Arg *arg = args.getLastArg( - clang::driver::options::OPT_mlarge_data_threshold_EQ)) { + if (const llvm::opt::Arg *arg = + args.getLastArg(clang::options::OPT_mlarge_data_threshold_EQ)) { uint64_t LDT; if (llvm::StringRef(arg->getValue()).getAsInteger(/*Radix=*/10, LDT)) { diags.Report(clang::diag::err_drv_invalid_value) @@ -481,15 +476,15 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, } // This option is compatible with -f[no-]underscoring in gfortran. - if (args.hasFlag(clang::driver::options::OPT_fno_underscoring, - clang::driver::options::OPT_funderscoring, false)) { + if (args.hasFlag(clang::options::OPT_fno_underscoring, + clang::options::OPT_funderscoring, false)) { opts.Underscoring = 0; } parseDoConcurrentMapping(opts, args, diags); if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_complex_range_EQ)) { + args.getLastArg(clang::options::OPT_complex_range_EQ)) { llvm::StringRef argValue = llvm::StringRef(arg->getValue()); if (argValue == "full") { opts.setComplexRange(CodeGenOptions::ComplexRangeKind::CX_Full); @@ -510,46 +505,42 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, /// \param [in] opts The target options instance to update /// \param [in] args The list of input arguments (from the compiler invocation) static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_triple)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_triple)) opts.triple = a->getValue(); - opts.atomicIgnoreDenormalMode = args.hasFlag( - clang::driver::options::OPT_fatomic_ignore_denormal_mode, - clang::driver::options::OPT_fno_atomic_ignore_denormal_mode, false); - opts.atomicFineGrainedMemory = args.hasFlag( - clang::driver::options::OPT_fatomic_fine_grained_memory, - clang::driver::options::OPT_fno_atomic_fine_grained_memory, false); + opts.atomicIgnoreDenormalMode = + args.hasFlag(clang::options::OPT_fatomic_ignore_denormal_mode, + clang::options::OPT_fno_atomic_ignore_denormal_mode, false); + opts.atomicFineGrainedMemory = + args.hasFlag(clang::options::OPT_fatomic_fine_grained_memory, + clang::options::OPT_fno_atomic_fine_grained_memory, false); opts.atomicRemoteMemory = - args.hasFlag(clang::driver::options::OPT_fatomic_remote_memory, - clang::driver::options::OPT_fno_atomic_remote_memory, false); + args.hasFlag(clang::options::OPT_fatomic_remote_memory, + clang::options::OPT_fno_atomic_remote_memory, false); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_target_cpu)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_target_cpu)) opts.cpu = a->getValue(); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_tune_cpu)) + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_tune_cpu)) opts.cpuToTuneFor = a->getValue(); for (const llvm::opt::Arg *currentArg : - args.filtered(clang::driver::options::OPT_target_feature)) + args.filtered(clang::options::OPT_target_feature)) opts.featuresAsWritten.emplace_back(currentArg->getValue()); - if (args.hasArg(clang::driver::options::OPT_fdisable_real_10)) + if (args.hasArg(clang::options::OPT_fdisable_real_10)) opts.disabledRealKinds.push_back(10); - if (args.hasArg(clang::driver::options::OPT_fdisable_real_3)) + if (args.hasArg(clang::options::OPT_fdisable_real_3)) opts.disabledRealKinds.push_back(3); - if (args.hasArg(clang::driver::options::OPT_fdisable_integer_2)) + if (args.hasArg(clang::options::OPT_fdisable_integer_2)) opts.disabledIntegerKinds.push_back(2); - if (args.hasArg(clang::driver::options::OPT_fdisable_integer_16)) + if (args.hasArg(clang::options::OPT_fdisable_integer_16)) opts.disabledIntegerKinds.push_back(16); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_mabi_EQ)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_mabi_EQ)) { opts.abi = a->getValue(); llvm::StringRef V = a->getValue(); if (V == "vec-extabi") { @@ -559,9 +550,8 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { } } - opts.asmVerbose = - args.hasFlag(clang::driver::options::OPT_fverbose_asm, - clang::driver::options::OPT_fno_verbose_asm, false); + opts.asmVerbose = args.hasFlag(clang::options::OPT_fverbose_asm, + clang::options::OPT_fno_verbose_asm, false); } // Tweak the frontend configuration based on the frontend action static void setUpFrontendBasedOnAction(FrontendOptions &opts) { @@ -594,11 +584,11 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Treat multiple action options as an invocation error. Note that `clang // -cc1` does accept multiple action options, but will only consider the // rightmost one. - if (args.hasMultipleArgs(clang::driver::options::OPT_Action_Group)) { + if (args.hasMultipleArgs(clang::options::OPT_Action_Group)) { llvm::SmallString<32> buf; llvm::raw_svector_ostream os(buf); for (const llvm::opt::Arg *arg : - args.filtered(clang::driver::options::OPT_Action_Group)) { + args.filtered(clang::options::OPT_Action_Group)) { if (buf.size()) os << ", "; os << "'" << arg->getSpelling() << "'"; @@ -609,99 +599,99 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Identify the action (i.e. opts.ProgramAction) if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_Action_Group)) { + args.getLastArg(clang::options::OPT_Action_Group)) { switch (a->getOption().getID()) { default: { llvm_unreachable("Invalid option in group!"); } - case clang::driver::options::OPT_test_io: + case clang::options::OPT_test_io: opts.programAction = InputOutputTest; break; - case clang::driver::options::OPT_E: + case clang::options::OPT_E: opts.programAction = PrintPreprocessedInput; break; - case clang::driver::options::OPT_fsyntax_only: + case clang::options::OPT_fsyntax_only: opts.programAction = ParseSyntaxOnly; break; - case clang::driver::options::OPT_emit_fir: + case clang::options::OPT_emit_fir: opts.programAction = EmitFIR; break; - case clang::driver::options::OPT_emit_hlfir: + case clang::options::OPT_emit_hlfir: opts.programAction = EmitHLFIR; break; - case clang::driver::options::OPT_emit_llvm: + case clang::options::OPT_emit_llvm: opts.programAction = EmitLLVM; break; - case clang::driver::options::OPT_emit_llvm_bc: + case clang::options::OPT_emit_llvm_bc: opts.programAction = EmitLLVMBitcode; break; - case clang::driver::options::OPT_emit_obj: + case clang::options::OPT_emit_obj: opts.programAction = EmitObj; break; - case clang::driver::options::OPT_S: + case clang::options::OPT_S: opts.programAction = EmitAssembly; break; - case clang::driver::options::OPT_fdebug_unparse: + case clang::options::OPT_fdebug_unparse: opts.programAction = DebugUnparse; break; - case clang::driver::options::OPT_fdebug_unparse_no_sema: + case clang::options::OPT_fdebug_unparse_no_sema: opts.programAction = DebugUnparseNoSema; break; - case clang::driver::options::OPT_fdebug_unparse_with_symbols: + case clang::options::OPT_fdebug_unparse_with_symbols: opts.programAction = DebugUnparseWithSymbols; break; - case clang::driver::options::OPT_fdebug_unparse_with_modules: + case clang::options::OPT_fdebug_unparse_with_modules: opts.programAction = DebugUnparseWithModules; break; - case clang::driver::options::OPT_fdebug_dump_symbols: + case clang::options::OPT_fdebug_dump_symbols: opts.programAction = DebugDumpSymbols; break; - case clang::driver::options::OPT_fdebug_dump_parse_tree: + case clang::options::OPT_fdebug_dump_parse_tree: opts.programAction = DebugDumpParseTree; break; - case clang::driver::options::OPT_fdebug_dump_pft: + case clang::options::OPT_fdebug_dump_pft: opts.programAction = DebugDumpPFT; break; - case clang::driver::options::OPT_fdebug_dump_all: + case clang::options::OPT_fdebug_dump_all: opts.programAction = DebugDumpAll; break; - case clang::driver::options::OPT_fdebug_dump_parse_tree_no_sema: + case clang::options::OPT_fdebug_dump_parse_tree_no_sema: opts.programAction = DebugDumpParseTreeNoSema; break; - case clang::driver::options::OPT_fdebug_dump_provenance: + case clang::options::OPT_fdebug_dump_provenance: opts.programAction = DebugDumpProvenance; break; - case clang::driver::options::OPT_fdebug_dump_parsing_log: + case clang::options::OPT_fdebug_dump_parsing_log: opts.programAction = DebugDumpParsingLog; break; - case clang::driver::options::OPT_fdebug_measure_parse_tree: + case clang::options::OPT_fdebug_measure_parse_tree: opts.programAction = DebugMeasureParseTree; break; - case clang::driver::options::OPT_fdebug_pre_fir_tree: + case clang::options::OPT_fdebug_pre_fir_tree: opts.programAction = DebugPreFIRTree; break; - case clang::driver::options::OPT_fget_symbols_sources: + case clang::options::OPT_fget_symbols_sources: opts.programAction = GetSymbolsSources; break; - case clang::driver::options::OPT_fget_definition: + case clang::options::OPT_fget_definition: opts.programAction = GetDefinition; break; - case clang::driver::options::OPT_init_only: + case clang::options::OPT_init_only: opts.programAction = InitOnly; break; // TODO: - // case clang::driver::options::OPT_emit_llvm: - // case clang::driver::options::OPT_emit_llvm_only: - // case clang::driver::options::OPT_emit_codegen_only: - // case clang::driver::options::OPT_emit_module: + // case clang::options::OPT_emit_llvm: + // case clang::options::OPT_emit_llvm_only: + // case clang::options::OPT_emit_codegen_only: + // case clang::options::OPT_emit_module: // (...) } // Parse the values provided with `-fget-definition` (there should be 3 // integers) if (llvm::opt::OptSpecifier(a->getOption().getID()) == - clang::driver::options::OPT_fget_definition) { + clang::options::OPT_fget_definition) { unsigned optVals[3] = {0, 0, 0}; for (unsigned i = 0; i < 3; i++) { @@ -721,27 +711,25 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Parsing -load option and storing shared object path - if (llvm::opt::Arg *a = args.getLastArg(clang::driver::options::OPT_load)) { + if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_load)) { opts.plugins.push_back(a->getValue()); } // Parsing -plugin option and storing plugin name and setting action - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_plugin)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_plugin)) { opts.programAction = PluginAction; opts.actionName = a->getValue(); } - opts.outputFile = args.getLastArgValue(clang::driver::options::OPT_o); - opts.showHelp = args.hasArg(clang::driver::options::OPT_help); - opts.showVersion = args.hasArg(clang::driver::options::OPT_version); + opts.outputFile = args.getLastArgValue(clang::options::OPT_o); + opts.showHelp = args.hasArg(clang::options::OPT_help); + opts.showVersion = args.hasArg(clang::options::OPT_version); opts.printSupportedCPUs = - args.hasArg(clang::driver::options::OPT_print_supported_cpus); + args.hasArg(clang::options::OPT_print_supported_cpus); // Get the input kind (from the value passed via `-x`) InputKind dashX(Language::Unknown); - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_x)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_x)) { llvm::StringRef xValue = a->getValue(); // Principal languages. dashX = llvm::StringSwitch(xValue) @@ -768,7 +756,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Collect the input files and save them in our instance of FrontendOptions. std::vector inputs = - args.getAllArgValues(clang::driver::options::OPT_INPUT); + args.getAllArgValues(clang::options::OPT_INPUT); opts.inputs.clear(); if (inputs.empty()) // '-' is the default input if none is given. @@ -788,18 +776,16 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Set fortranForm based on options -ffree-form and -ffixed-form. - if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_ffixed_form, - clang::driver::options::OPT_ffree_form)) { - opts.fortranForm = - arg->getOption().matches(clang::driver::options::OPT_ffixed_form) - ? FortranForm::FixedForm - : FortranForm::FreeForm; + if (const auto *arg = args.getLastArg(clang::options::OPT_ffixed_form, + clang::options::OPT_ffree_form)) { + opts.fortranForm = arg->getOption().matches(clang::options::OPT_ffixed_form) + ? FortranForm::FixedForm + : FortranForm::FreeForm; } // Set fixedFormColumns based on -ffixed-line-length= if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_ffixed_line_length_EQ)) { + args.getLastArg(clang::options::OPT_ffixed_line_length_EQ)) { llvm::StringRef argValue = llvm::StringRef(arg->getValue()); std::int64_t columns = -1; if (argValue == "none") { @@ -821,8 +807,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Set conversion based on -fconvert= - if (const auto *arg = - args.getLastArg(clang::driver::options::OPT_fconvert_EQ)) { + if (const auto *arg = args.getLastArg(clang::options::OPT_fconvert_EQ)) { const char *argValue = arg->getValue(); if (auto convert = parseConvertArg(argValue)) opts.envDefaults.push_back({"FORT_CONVERT", *convert}); @@ -832,59 +817,55 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // -f{no-}implicit-none - opts.features.Enable( - Fortran::common::LanguageFeature::ImplicitNoneTypeAlways, - args.hasFlag(clang::driver::options::OPT_fimplicit_none, - clang::driver::options::OPT_fno_implicit_none, false)); + opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneTypeAlways, + args.hasFlag(clang::options::OPT_fimplicit_none, + clang::options::OPT_fno_implicit_none, + false)); // -f{no-}implicit-none-ext - opts.features.Enable( - Fortran::common::LanguageFeature::ImplicitNoneExternal, - args.hasFlag(clang::driver::options::OPT_fimplicit_none_ext, - clang::driver::options::OPT_fno_implicit_none_ext, false)); + opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneExternal, + args.hasFlag(clang::options::OPT_fimplicit_none_ext, + clang::options::OPT_fno_implicit_none_ext, + false)); // -f{no-}backslash opts.features.Enable(Fortran::common::LanguageFeature::BackslashEscapes, - args.hasFlag(clang::driver::options::OPT_fbackslash, - clang::driver::options::OPT_fno_backslash, - false)); + args.hasFlag(clang::options::OPT_fbackslash, + clang::options::OPT_fno_backslash, false)); // -f{no-}logical-abbreviations opts.features.Enable( Fortran::common::LanguageFeature::LogicalAbbreviations, - args.hasFlag(clang::driver::options::OPT_flogical_abbreviations, - clang::driver::options::OPT_fno_logical_abbreviations, - false)); + args.hasFlag(clang::options::OPT_flogical_abbreviations, + clang::options::OPT_fno_logical_abbreviations, false)); // -f{no-}unsigned opts.features.Enable(Fortran::common::LanguageFeature::Unsigned, - args.hasFlag(clang::driver::options::OPT_funsigned, - clang::driver::options::OPT_fno_unsigned, - false)); + args.hasFlag(clang::options::OPT_funsigned, + clang::options::OPT_fno_unsigned, false)); // -f{no-}xor-operator - opts.features.Enable( - Fortran::common::LanguageFeature::XOROperator, - args.hasFlag(clang::driver::options::OPT_fxor_operator, - clang::driver::options::OPT_fno_xor_operator, false)); + opts.features.Enable(Fortran::common::LanguageFeature::XOROperator, + args.hasFlag(clang::options::OPT_fxor_operator, + clang::options::OPT_fno_xor_operator, + false)); // -fno-automatic - if (args.hasArg(clang::driver::options::OPT_fno_automatic)) { + if (args.hasArg(clang::options::OPT_fno_automatic)) { opts.features.Enable(Fortran::common::LanguageFeature::DefaultSave); } // -f{no}-save-main-program - opts.features.Enable( - Fortran::common::LanguageFeature::SaveMainProgram, - args.hasFlag(clang::driver::options::OPT_fsave_main_program, - clang::driver::options::OPT_fno_save_main_program, false)); + opts.features.Enable(Fortran::common::LanguageFeature::SaveMainProgram, + args.hasFlag(clang::options::OPT_fsave_main_program, + clang::options::OPT_fno_save_main_program, + false)); - if (args.hasArg( - clang::driver::options::OPT_falternative_parameter_statement)) { + if (args.hasArg(clang::options::OPT_falternative_parameter_statement)) { opts.features.Enable(Fortran::common::LanguageFeature::OldStyleParameter); } if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_finput_charset_EQ)) { + args.getLastArg(clang::options::OPT_finput_charset_EQ)) { llvm::StringRef argValue = arg->getValue(); if (argValue == "utf-8") { opts.encoding = Fortran::parser::Encoding::UTF_8; @@ -929,9 +910,9 @@ static std::string getOpenMPHeadersDir(const char *argv) { static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts, llvm::opt::ArgList &args) { // Add macros from the command line. - for (const auto *currentArg : args.filtered(clang::driver::options::OPT_D, - clang::driver::options::OPT_U)) { - if (currentArg->getOption().matches(clang::driver::options::OPT_D)) { + for (const auto *currentArg : + args.filtered(clang::options::OPT_D, clang::options::OPT_U)) { + if (currentArg->getOption().matches(clang::options::OPT_D)) { opts.addMacroDef(currentArg->getValue()); } else { opts.addMacroUndef(currentArg->getValue()); @@ -939,34 +920,33 @@ static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts, } // Add the ordered list of -I's. - for (const auto *currentArg : args.filtered(clang::driver::options::OPT_I)) + for (const auto *currentArg : args.filtered(clang::options::OPT_I)) opts.searchDirectoriesFromDashI.emplace_back(currentArg->getValue()); // Prepend the ordered list of -intrinsic-modules-path // to the default location to search. for (const auto *currentArg : - args.filtered(clang::driver::options::OPT_fintrinsic_modules_path)) + args.filtered(clang::options::OPT_fintrinsic_modules_path)) opts.searchDirectoriesFromIntrModPath.emplace_back(currentArg->getValue()); // -cpp/-nocpp - if (const auto *currentArg = args.getLastArg( - clang::driver::options::OPT_cpp, clang::driver::options::OPT_nocpp)) - opts.macrosFlag = - (currentArg->getOption().matches(clang::driver::options::OPT_cpp)) - ? PPMacrosFlag::Include - : PPMacrosFlag::Exclude; + if (const auto *currentArg = + args.getLastArg(clang::options::OPT_cpp, clang::options::OPT_nocpp)) + opts.macrosFlag = (currentArg->getOption().matches(clang::options::OPT_cpp)) + ? PPMacrosFlag::Include + : PPMacrosFlag::Exclude; // Enable -cpp based on -x unless explicitly disabled with -nocpp if (opts.macrosFlag != PPMacrosFlag::Exclude) - if (const auto *dashX = args.getLastArg(clang::driver::options::OPT_x)) + if (const auto *dashX = args.getLastArg(clang::options::OPT_x)) opts.macrosFlag = llvm::StringSwitch(dashX->getValue()) .Case("f95-cpp-input", PPMacrosFlag::Include) .Default(opts.macrosFlag); - opts.noReformat = args.hasArg(clang::driver::options::OPT_fno_reformat); + opts.noReformat = args.hasArg(clang::options::OPT_fno_reformat); opts.preprocessIncludeLines = - args.hasArg(clang::driver::options::OPT_fpreprocess_include_lines); - opts.noLineDirectives = args.hasArg(clang::driver::options::OPT_P); - opts.showMacros = args.hasArg(clang::driver::options::OPT_dM); + args.hasArg(clang::options::OPT_fpreprocess_include_lines); + opts.noLineDirectives = args.hasArg(clang::options::OPT_P); + opts.showMacros = args.hasArg(clang::options::OPT_dM); } /// Parses all semantic related arguments and populates the variables @@ -977,7 +957,7 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -J/module-dir option std::vector moduleDirList = - args.getAllArgValues(clang::driver::options::OPT_module_dir); + args.getAllArgValues(clang::options::OPT_module_dir); // User can only specify one -J/-module-dir directory, but may repeat // -J/-module-dir as long as the directory is the same each time. // https://gcc.gnu.org/onlinedocs/gfortran/Directory-Options.html @@ -996,25 +976,25 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args, res.setModuleDir(moduleDirList[0]); // -fdebug-module-writer option - if (args.hasArg(clang::driver::options::OPT_fdebug_module_writer)) { + if (args.hasArg(clang::options::OPT_fdebug_module_writer)) { res.setDebugModuleDir(true); } // -fhermetic-module-files option - if (args.hasArg(clang::driver::options::OPT_fhermetic_module_files)) { + if (args.hasArg(clang::options::OPT_fhermetic_module_files)) { res.setHermeticModuleFileOutput(true); } // -module-suffix if (const auto *moduleSuffix = - args.getLastArg(clang::driver::options::OPT_module_suffix)) { + args.getLastArg(clang::options::OPT_module_suffix)) { res.setModuleFileSuffix(moduleSuffix->getValue()); } // -f{no-}analyzed-objects-for-unparse - res.setUseAnalyzedObjectsForUnparse(args.hasFlag( - clang::driver::options::OPT_fanalyzed_objects_for_unparse, - clang::driver::options::OPT_fno_analyzed_objects_for_unparse, true)); + res.setUseAnalyzedObjectsForUnparse( + args.hasFlag(clang::options::OPT_fanalyzed_objects_for_unparse, + clang::options::OPT_fno_analyzed_objects_for_unparse, true)); return diags.getNumErrors() == numErrorsBefore; } @@ -1031,7 +1011,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // chosen to match clang's behavior. // -pedantic - if (args.hasArg(clang::driver::options::OPT_pedantic)) { + if (args.hasArg(clang::options::OPT_pedantic)) { features.WarnOnAllNonstandard(); features.WarnOnAllUsage(); res.setEnableConformanceChecks(); @@ -1041,9 +1021,8 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -Werror option // TODO: Currently throws a Diagnostic for anything other than -W, // this has to change when other -W's are supported. - if (args.hasArg(clang::driver::options::OPT_W_Joined)) { - const auto &wArgs = - args.getAllArgValues(clang::driver::options::OPT_W_Joined); + if (args.hasArg(clang::options::OPT_W_Joined)) { + const auto &wArgs = args.getAllArgValues(clang::options::OPT_W_Joined); for (const auto &wArg : wArgs) { if (wArg == "error") { res.setWarnAsErr(true); @@ -1060,7 +1039,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -w - if (args.hasArg(clang::driver::options::OPT_w)) { + if (args.hasArg(clang::options::OPT_w)) { features.DisableAllWarnings(); res.setDisableWarnings(); } @@ -1080,7 +1059,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, unsigned numErrorsBefore = diags.getNumErrors(); // -fd-lines-as-code - if (args.hasArg(clang::driver::options::OPT_fd_lines_as_code)) { + if (args.hasArg(clang::options::OPT_fd_lines_as_code)) { if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) { const unsigned fdLinesAsWarning = diags.getCustomDiagID( clang::DiagnosticsEngine::Warning, @@ -1093,7 +1072,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -fd-lines-as-comments - if (args.hasArg(clang::driver::options::OPT_fd_lines_as_comments)) { + if (args.hasArg(clang::options::OPT_fd_lines_as_comments)) { if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) { const unsigned fdLinesAsWarning = diags.getCustomDiagID( clang::DiagnosticsEngine::Warning, @@ -1106,18 +1085,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -fdefault* family - if (args.hasArg(clang::driver::options::OPT_fdefault_real_8)) { + if (args.hasArg(clang::options::OPT_fdefault_real_8)) { res.getDefaultKinds().set_defaultRealKind(8); res.getDefaultKinds().set_doublePrecisionKind(16); } - if (args.hasArg(clang::driver::options::OPT_fdefault_integer_8)) { + if (args.hasArg(clang::options::OPT_fdefault_integer_8)) { res.getDefaultKinds().set_defaultIntegerKind(8); res.getDefaultKinds().set_subscriptIntegerKind(8); res.getDefaultKinds().set_sizeIntegerKind(8); res.getDefaultKinds().set_defaultLogicalKind(8); } - if (args.hasArg(clang::driver::options::OPT_fdefault_double_8)) { - if (!args.hasArg(clang::driver::options::OPT_fdefault_real_8)) { + if (args.hasArg(clang::options::OPT_fdefault_double_8)) { + if (!args.hasArg(clang::options::OPT_fdefault_real_8)) { // -fdefault-double-8 has to be used with -fdefault-real-8 // to be compatible with gfortran const unsigned diagID = diags.getCustomDiagID( @@ -1128,18 +1107,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // https://gcc.gnu.org/onlinedocs/gfortran/Fortran-Dialect-Options.html res.getDefaultKinds().set_doublePrecisionKind(8); } - if (args.hasArg(clang::driver::options::OPT_flarge_sizes)) + if (args.hasArg(clang::options::OPT_flarge_sizes)) res.getDefaultKinds().set_sizeIntegerKind(8); // -x cuda - auto language = args.getLastArgValue(clang::driver::options::OPT_x); + auto language = args.getLastArgValue(clang::options::OPT_x); if (language == "cuda") { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::CUDA); } // -fopenacc - if (args.hasArg(clang::driver::options::OPT_fopenacc)) { + if (args.hasArg(clang::options::OPT_fopenacc)) { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::OpenACC); } @@ -1147,8 +1126,8 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -std=f2018 // TODO: Set proper options when more fortran standards // are supported. - if (args.hasArg(clang::driver::options::OPT_std_EQ)) { - auto standard = args.getLastArgValue(clang::driver::options::OPT_std_EQ); + if (args.hasArg(clang::options::OPT_std_EQ)) { + auto standard = args.getLastArgValue(clang::options::OPT_std_EQ); // We only allow f2018 as the given standard if (standard == "f2018") { res.setEnableConformanceChecks(); @@ -1161,7 +1140,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } } // -fcoarray - if (args.hasArg(clang::driver::options::OPT_fcoarray)) { + if (args.hasArg(clang::options::OPT_fcoarray)) { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::Coarray); const unsigned diagID = @@ -1179,13 +1158,12 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, /// generated. static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp, - clang::driver::options::OPT_fno_openmp); - if (!arg || - arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) { - bool isSimdSpecified = args.hasFlag( - clang::driver::options::OPT_fopenmp_simd, - clang::driver::options::OPT_fno_openmp_simd, /*Default=*/false); + llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fopenmp, + clang::options::OPT_fno_openmp); + if (!arg || arg->getOption().matches(clang::options::OPT_fno_openmp)) { + bool isSimdSpecified = + args.hasFlag(clang::options::OPT_fopenmp_simd, + clang::options::OPT_fno_openmp_simd, /*Default=*/false); if (!isSimdSpecified) return true; res.getLangOpts().OpenMPSimd = 1; @@ -1200,8 +1178,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, res.getLangOpts().OpenMPVersion = newestFullySupported; res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::OpenMP); - if (auto *arg = - args.getLastArg(clang::driver::options::OPT_fopenmp_version_EQ)) { + if (auto *arg = args.getLastArg(clang::options::OPT_fopenmp_version_EQ)) { llvm::ArrayRef ompVersions = llvm::omp::getOpenMPVersions(); unsigned oldVersions[] = {11, 20, 25, 30}; unsigned version = 0; @@ -1254,16 +1231,16 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } } - if (args.hasArg(clang::driver::options::OPT_fopenmp_force_usm)) { + if (args.hasArg(clang::options::OPT_fopenmp_force_usm)) { res.getLangOpts().OpenMPForceUSM = 1; } - if (args.hasArg(clang::driver::options::OPT_fopenmp_is_target_device)) { + if (args.hasArg(clang::options::OPT_fopenmp_is_target_device)) { res.getLangOpts().OpenMPIsTargetDevice = 1; // Get OpenMP host file path if any and report if a non existent file is // found - if (auto *arg = args.getLastArg( - clang::driver::options::OPT_fopenmp_host_ir_file_path)) { + if (auto *arg = + args.getLastArg(clang::options::OPT_fopenmp_host_ir_file_path)) { res.getLangOpts().OMPHostIRFile = arg->getValue(); if (!llvm::sys::fs::exists(res.getLangOpts().OMPHostIRFile)) diags.Report(clang::diag::err_omp_host_ir_file_not_found) @@ -1271,37 +1248,34 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } if (args.hasFlag( - clang::driver::options::OPT_fopenmp_assume_teams_oversubscription, - clang::driver::options:: - OPT_fno_openmp_assume_teams_oversubscription, + clang::options::OPT_fopenmp_assume_teams_oversubscription, + clang::options::OPT_fno_openmp_assume_teams_oversubscription, /*Default=*/false)) res.getLangOpts().OpenMPTeamSubscription = true; - if (args.hasArg(clang::driver::options::OPT_fopenmp_assume_no_thread_state)) + if (args.hasArg(clang::options::OPT_fopenmp_assume_no_thread_state)) res.getLangOpts().OpenMPNoThreadState = 1; - if (args.hasArg( - clang::driver::options::OPT_fopenmp_assume_no_nested_parallelism)) + if (args.hasArg(clang::options::OPT_fopenmp_assume_no_nested_parallelism)) res.getLangOpts().OpenMPNoNestedParallelism = 1; if (args.hasFlag( - clang::driver::options::OPT_fopenmp_assume_threads_oversubscription, - clang::driver::options:: - OPT_fno_openmp_assume_threads_oversubscription, + clang::options::OPT_fopenmp_assume_threads_oversubscription, + clang::options::OPT_fno_openmp_assume_threads_oversubscription, /*Default=*/false)) res.getLangOpts().OpenMPThreadSubscription = true; - if ((args.hasArg(clang::driver::options::OPT_fopenmp_target_debug) || - args.hasArg(clang::driver::options::OPT_fopenmp_target_debug_EQ))) { - res.getLangOpts().OpenMPTargetDebug = getLastArgIntValue( - args, clang::driver::options::OPT_fopenmp_target_debug_EQ, - res.getLangOpts().OpenMPTargetDebug, diags); + if ((args.hasArg(clang::options::OPT_fopenmp_target_debug) || + args.hasArg(clang::options::OPT_fopenmp_target_debug_EQ))) { + res.getLangOpts().OpenMPTargetDebug = + getLastArgIntValue(args, clang::options::OPT_fopenmp_target_debug_EQ, + res.getLangOpts().OpenMPTargetDebug, diags); if (!res.getLangOpts().OpenMPTargetDebug && - args.hasArg(clang::driver::options::OPT_fopenmp_target_debug)) + args.hasArg(clang::options::OPT_fopenmp_target_debug)) res.getLangOpts().OpenMPTargetDebug = 1; } - if (args.hasArg(clang::driver::options::OPT_no_offloadlib)) + if (args.hasArg(clang::options::OPT_no_offloadlib)) res.getLangOpts().NoGPULib = 1; } if (llvm::Triple(res.getTargetOpts().triple).isGPU()) { @@ -1317,8 +1291,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // Get the OpenMP target triples if any. - if (auto *arg = - args.getLastArg(clang::driver::options::OPT_offload_targets_EQ)) { + if (auto *arg = args.getLastArg(clang::options::OPT_offload_targets_EQ)) { enum ArchPtrSize { Arch16Bit, Arch32Bit, Arch64Bit }; auto getArchPtrSize = [](const llvm::Triple &triple) { if (triple.isArch16Bit()) @@ -1361,7 +1334,7 @@ static bool parseIntegerOverflowArgs(CompilerInvocation &invoc, clang::DiagnosticsEngine &diags) { Fortran::common::LangOptions &opts = invoc.getLangOpts(); - if (args.getLastArg(clang::driver::options::OPT_fwrapv)) + if (args.getLastArg(clang::options::OPT_fwrapv)) opts.setSignedOverflowBehavior(Fortran::common::LangOptions::SOB_Defined); return true; @@ -1380,7 +1353,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, Fortran::common::LangOptions &opts = invoc.getLangOpts(); if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_ffp_contract)) { + args.getLastArg(clang::options::OPT_ffp_contract)) { const llvm::StringRef val = a->getValue(); enum Fortran::common::LangOptions::FPModeKind fpContractMode; @@ -1397,31 +1370,31 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, opts.setFPContractMode(fpContractMode); } - if (args.getLastArg(clang::driver::options::OPT_menable_no_infs)) { + if (args.getLastArg(clang::options::OPT_menable_no_infs)) { opts.NoHonorInfs = true; } - if (args.getLastArg(clang::driver::options::OPT_menable_no_nans)) { + if (args.getLastArg(clang::options::OPT_menable_no_nans)) { opts.NoHonorNaNs = true; } - if (args.getLastArg(clang::driver::options::OPT_fapprox_func)) { + if (args.getLastArg(clang::options::OPT_fapprox_func)) { opts.ApproxFunc = true; } - if (args.getLastArg(clang::driver::options::OPT_fno_signed_zeros)) { + if (args.getLastArg(clang::options::OPT_fno_signed_zeros)) { opts.NoSignedZeros = true; } - if (args.getLastArg(clang::driver::options::OPT_mreassociate)) { + if (args.getLastArg(clang::options::OPT_mreassociate)) { opts.AssociativeMath = true; } - if (args.getLastArg(clang::driver::options::OPT_freciprocal_math)) { + if (args.getLastArg(clang::options::OPT_freciprocal_math)) { opts.ReciprocalMath = true; } - if (args.getLastArg(clang::driver::options::OPT_ffast_math)) { + if (args.getLastArg(clang::options::OPT_ffast_math)) { opts.NoHonorInfs = true; opts.NoHonorNaNs = true; opts.AssociativeMath = true; @@ -1431,7 +1404,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast); } - if (args.hasArg(clang::driver::options::OPT_fno_fast_real_mod)) + if (args.hasArg(clang::options::OPT_fno_fast_real_mod)) opts.NoFastRealMod = true; return true; @@ -1446,10 +1419,8 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, /// \param [out] diags DiagnosticsEngine to report erros with static bool parseVScaleArgs(CompilerInvocation &invoc, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - const auto *vscaleMin = - args.getLastArg(clang::driver::options::OPT_mvscale_min_EQ); - const auto *vscaleMax = - args.getLastArg(clang::driver::options::OPT_mvscale_max_EQ); + const auto *vscaleMin = args.getLastArg(clang::options::OPT_mvscale_min_EQ); + const auto *vscaleMax = args.getLastArg(clang::options::OPT_mvscale_max_EQ); if (!vscaleMin && !vscaleMax) return true; @@ -1497,8 +1468,7 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, // TODO: support --dependent-lib on other platforms when MLIR supports // !llvm.dependent.lib - if (args.hasArg(clang::driver::options::OPT_dependent_lib) && - !triple.isOSWindows()) { + if (args.hasArg(clang::options::OPT_dependent_lib) && !triple.isOSWindows()) { const unsigned diagID = diags.getCustomDiagID(clang::DiagnosticsEngine::Error, "--dependent-lib is only supported on Windows"); @@ -1506,12 +1476,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, return false; } - opts.DependentLibs = - args.getAllArgValues(clang::driver::options::OPT_dependent_lib); + opts.DependentLibs = args.getAllArgValues(clang::options::OPT_dependent_lib); // -flto=full/thin option. - if (const llvm::opt::Arg *a = - args.getLastArg(clang::driver::options::OPT_flto_EQ)) { + if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_flto_EQ)) { llvm::StringRef s = a->getValue(); assert((s == "full" || s == "thin") && "Unknown LTO mode."); if (s == "full") @@ -1522,10 +1490,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, // -ffat-lto-objects if (const llvm::opt::Arg *arg = - args.getLastArg(clang::driver::options::OPT_ffat_lto_objects, - clang::driver::options::OPT_fno_fat_lto_objects)) { + args.getLastArg(clang::options::OPT_ffat_lto_objects, + clang::options::OPT_fno_fat_lto_objects)) { opts.PrepareForFatLTO = - arg->getOption().matches(clang::driver::options::OPT_ffat_lto_objects); + arg->getOption().matches(clang::options::OPT_ffat_lto_objects); if (opts.PrepareForFatLTO) { assert((opts.PrepareForFullLTO || opts.PrepareForThinLTO) && "Unknown LTO mode"); @@ -1566,8 +1534,8 @@ bool CompilerInvocation::createFromArgs( llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple()); // Parse the arguments - const llvm::opt::OptTable &opts = clang::driver::getDriverOptTable(); - llvm::opt::Visibility visibilityMask(clang::driver::options::FC1Option); + const llvm::opt::OptTable &opts = clang::getDriverOptTable(); + llvm::opt::Visibility visibilityMask(clang::options::FC1Option); unsigned missingArgIndex, missingArgCount; llvm::opt::InputArgList args = opts.ParseArgs( commandLineArgs, missingArgIndex, missingArgCount, visibilityMask); @@ -1580,7 +1548,7 @@ bool CompilerInvocation::createFromArgs( } // Issue errors on unknown arguments - for (const auto *a : args.filtered(clang::driver::options::OPT_UNKNOWN)) { + for (const auto *a : args.filtered(clang::options::OPT_UNKNOWN)) { auto argString = a->getAsString(args); std::string nearest; if (opts.findNearest(argString, nearest, visibilityMask) > 1) @@ -1592,15 +1560,15 @@ bool CompilerInvocation::createFromArgs( } // -flang-experimental-hlfir - if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir) || - args.hasArg(clang::driver::options::OPT_emit_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_experimental_hlfir) || + args.hasArg(clang::options::OPT_emit_hlfir)) { invoc.loweringOpts.setLowerToHighLevelFIR(true); } // -flang-deprecated-no-hlfir - if (args.hasArg(clang::driver::options::OPT_flang_deprecated_no_hlfir) && - !args.hasArg(clang::driver::options::OPT_emit_hlfir)) { - if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_deprecated_no_hlfir) && + !args.hasArg(clang::options::OPT_emit_hlfir)) { + if (args.hasArg(clang::options::OPT_flang_experimental_hlfir)) { const unsigned diagID = diags.getCustomDiagID( clang::DiagnosticsEngine::Error, "Options '-flang-experimental-hlfir' and " @@ -1611,13 +1579,13 @@ bool CompilerInvocation::createFromArgs( } // -fno-ppc-native-vector-element-order - if (args.hasArg(clang::driver::options::OPT_fno_ppc_native_vec_elem_order)) { + if (args.hasArg(clang::options::OPT_fno_ppc_native_vec_elem_order)) { invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } // -f[no-]init-global-zero - if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, - clang::driver::options::OPT_fno_init_global_zero, + if (args.hasFlag(clang::options::OPT_finit_global_zero, + clang::options::OPT_fno_init_global_zero, /*default=*/true)) invoc.loweringOpts.setInitGlobalZero(true); else @@ -1626,8 +1594,8 @@ bool CompilerInvocation::createFromArgs( // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp. - for (auto *a : args.filtered(clang::driver::options::OPT_R_Group)) { - if (a->getOption().matches(clang::driver::options::OPT_R_value_Group)) + for (auto *a : args.filtered(clang::options::OPT_R_Group)) { + if (a->getOption().matches(clang::options::OPT_R_value_Group)) // This is -Rfoo=, where foo is the name of the diagnostic // group. Add only the remark option name to the diagnostics. e.g. for // -Rpass= we will add the string "pass". @@ -1640,20 +1608,19 @@ bool CompilerInvocation::createFromArgs( } // -frealloc-lhs is the default. - if (!args.hasFlag(clang::driver::options::OPT_frealloc_lhs, - clang::driver::options::OPT_fno_realloc_lhs, true)) + if (!args.hasFlag(clang::options::OPT_frealloc_lhs, + clang::options::OPT_fno_realloc_lhs, true)) invoc.loweringOpts.setReallocateLHS(false); - invoc.loweringOpts.setRepackArrays( - args.hasFlag(clang::driver::options::OPT_frepack_arrays, - clang::driver::options::OPT_fno_repack_arrays, - /*default=*/false)); + invoc.loweringOpts.setRepackArrays(args.hasFlag( + clang::options::OPT_frepack_arrays, clang::options::OPT_fno_repack_arrays, + /*default=*/false)); invoc.loweringOpts.setStackRepackArrays( - args.hasFlag(clang::driver::options::OPT_fstack_repack_arrays, - clang::driver::options::OPT_fno_stack_repack_arrays, + args.hasFlag(clang::options::OPT_fstack_repack_arrays, + clang::options::OPT_fno_stack_repack_arrays, /*default=*/false)); - if (auto *arg = args.getLastArg( - clang::driver::options::OPT_frepack_arrays_contiguity_EQ)) + if (auto *arg = + args.getLastArg(clang::options::OPT_frepack_arrays_contiguity_EQ)) invoc.loweringOpts.setRepackArraysWhole(arg->getValue() == llvm::StringRef{"whole"}); @@ -1673,10 +1640,8 @@ bool CompilerInvocation::createFromArgs( // `mlirArgs`. Instead, you can use // * `-mllvm `, or // * `-mmlir `. - invoc.frontendOpts.llvmArgs = - args.getAllArgValues(clang::driver::options::OPT_mllvm); - invoc.frontendOpts.mlirArgs = - args.getAllArgValues(clang::driver::options::OPT_mmlir); + invoc.frontendOpts.llvmArgs = args.getAllArgValues(clang::options::OPT_mllvm); + invoc.frontendOpts.mlirArgs = args.getAllArgValues(clang::options::OPT_mmlir); success &= parseLangOptionsArgs(invoc, args, diags); @@ -1700,7 +1665,7 @@ bool CompilerInvocation::createFromArgs( } // Process the timing-related options. - if (args.hasArg(clang::driver::options::OPT_ftime_report)) + if (args.hasArg(clang::options::OPT_ftime_report)) invoc.enableTimers = true; invoc.setArgv0(argv0); diff --git a/flang/lib/FrontendTool/CMakeLists.txt b/flang/lib/FrontendTool/CMakeLists.txt index faf56e9d955a1..b69436c36d438 100644 --- a/flang/lib/FrontendTool/CMakeLists.txt +++ b/flang/lib/FrontendTool/CMakeLists.txt @@ -18,5 +18,6 @@ add_flang_library(flangFrontendTool CLANG_LIBS clangBasic + clangOptions clangDriver ) diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index 09ac129d3e689..7586be59ba01b 100644 --- a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -23,7 +23,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/Pass/PassManager.h" #include "clang/Basic/DiagnosticFrontend.h" -#include "clang/Driver/Options.h" +#include "clang/Options/Options.h" #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" #include "llvm/Support/BuryPointer.h" @@ -153,10 +153,10 @@ updateDiagEngineForOptRemarks(clang::DiagnosticsEngine &diagsEng, bool executeCompilerInvocation(CompilerInstance *flang) { // Honor -help. if (flang->getFrontendOpts().showHelp) { - clang::driver::getDriverOptTable().printHelp( + clang::getDriverOptTable().printHelp( llvm::outs(), "flang -fc1 [options] file...", "LLVM 'Flang' Compiler", /*ShowHidden=*/false, /*ShowAllAliases=*/false, - llvm::opt::Visibility(clang::driver::options::FC1Option)); + llvm::opt::Visibility(clang::options::FC1Option)); return true; } diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt index b5d6727025121..4dfc0d40cd55d 100644 --- a/flang/tools/flang-driver/CMakeLists.txt +++ b/flang/tools/flang-driver/CMakeLists.txt @@ -26,6 +26,7 @@ target_link_libraries(flang clang_target_link_libraries(flang PRIVATE clangDriver + clangOptions clangBasic ) diff --git a/flang/tools/flang-driver/driver.cpp b/flang/tools/flang-driver/driver.cpp index bd878b7a642f1..0840255a739f3 100644 --- a/flang/tools/flang-driver/driver.cpp +++ b/flang/tools/flang-driver/driver.cpp @@ -52,9 +52,9 @@ createAndPopulateDiagOpts(llvm::ArrayRef argv) { // Any errors that would be diagnosed here will also be diagnosed later, // when the DiagnosticsEngine actually exists. unsigned missingArgIndex, missingArgCount; - llvm::opt::InputArgList args = clang::driver::getDriverOptTable().ParseArgs( + llvm::opt::InputArgList args = clang::getDriverOptTable().ParseArgs( argv.slice(1), missingArgIndex, missingArgCount, - llvm::opt::Visibility(clang::driver::options::FlangOption)); + llvm::opt::Visibility(clang::options::FlangOption)); (void)Fortran::frontend::parseDiagnosticArgs(*diagOpts, args); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp index 8b4a3e0a7c3fb..bfbd85ea34203 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp @@ -56,7 +56,7 @@ using namespace lldb; using namespace lldb_private; #define OPTTABLE_STR_TABLE_CODE -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTTABLE_STR_TABLE_CODE static Status ExceptionMaskValidator(const char *string, void *unused) { @@ -1124,7 +1124,7 @@ void PlatformDarwin::AddClangModuleCompilationOptionsForSDKType( #define OPTION(PREFIX_OFFSET, NAME_OFFSET, VAR, ...) \ llvm::StringRef opt_##VAR = OptionStrTable[NAME_OFFSET]; \ (void)opt_##VAR; -#include "clang/Driver/Options.inc" +#include "clang/Options/Options.inc" #undef OPTION minimum_version_option << '-'; switch (sdk_type) { diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 7a2e9ad154f01..ea8ac6dbe6e34 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -401,7 +401,7 @@ class LLVM_ABI MCAsmInfo { // Generated object files can use all ELF features supported by GNU ld of // this binutils version and later. INT_MAX means all features can be used, // regardless of GNU ld support. The default value is referenced by - // clang/Driver/Options.td. + // clang/Options/Options.td. std::pair BinutilsVersion = {2, 26}; /// Should we use the integrated assembler? diff --git a/llvm/include/llvm/Option/Arg.h b/llvm/include/llvm/Option/Arg.h index b1e56b58da684..496373d28600f 100644 --- a/llvm/include/llvm/Option/Arg.h +++ b/llvm/include/llvm/Option/Arg.h @@ -51,7 +51,7 @@ class Arg { /// Was this argument used to affect compilation? /// /// This is used to generate an "argument unused" warning (without - /// clang::driver::options::TargetSpecific) or "unsupported option" error + /// clang::options::TargetSpecific) or "unsupported option" error /// (with TargetSpecific). mutable unsigned Claimed : 1; diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 4d279bffd3723..36b71dd49ef13 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1501,9 +1501,9 @@ cc_library( gentbl_cc_library( name = "driver_options_inc_gen", - tbl_outs = {"include/clang/Driver/Options.inc": ["-gen-opt-parser-defs"]}, + tbl_outs = {"include/clang/Options/Options.inc": ["-gen-opt-parser-defs"]}, tblgen = "//llvm:llvm-tblgen", - td_file = "include/clang/Driver/Options.td", + td_file = "include/clang/Options/Options.td", deps = ["//llvm:OptParserTdFiles"], ) From 4ae7348513c700cf0a34a65094e925cba1084424 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Mon, 10 Nov 2025 12:32:43 -0800 Subject: [PATCH 02/30] [DirectX] Teach DXILResourceAccess about cbuffers (#164554) This isn't reachable today but will come into play once we reorder passes for #147352 and #147351. Note that the `CBufferRowIntrin` helper struct is copied from the `DXILCBufferAccess` pass, but it will be removed from there when we simplify that pass in #147351 --- .../lib/Target/DirectX/DXILResourceAccess.cpp | 156 ++++++++++++++++-- .../load-cbuffer-array-of-struct.ll | 59 +++++++ .../load-cbuffer-array-of-vector.ll | 49 ++++++ .../load-cbuffer-array-typedgep.ll | 30 ++++ .../ResourceAccess/load-cbuffer-arrays.ll | 145 ++++++++++++++++ .../load-cbuffer-dynamic-struct.ll | 64 +++++++ .../ResourceAccess/load-cbuffer-dynamic.ll | 46 ++++++ .../ResourceAccess/load-cbuffer-scalars.ll | 101 ++++++++++++ .../ResourceAccess/load-cbuffer-vectors.ll | 121 ++++++++++++++ 9 files changed, 761 insertions(+), 10 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll create mode 100644 llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp index 6579d3405cf39..057d87bc3c6a9 100644 --- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp +++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp @@ -10,6 +10,7 @@ #include "DirectX.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DXILResource.h" +#include "llvm/Frontend/HLSL/HLSLResource.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -20,6 +21,7 @@ #include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/User.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Transforms/Utils/ValueMapper.h" #define DEBUG_TYPE "dxil-resource-access" @@ -44,16 +46,28 @@ static Value *calculateGEPOffset(GetElementPtrInst *GEP, Value *PrevOffset, APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0); if (GEP->accumulateConstantOffset(DL, ConstantOffset)) { APInt Scaled = ConstantOffset.udiv(ScalarSize); - return ConstantInt::get(Type::getInt32Ty(GEP->getContext()), Scaled); + return ConstantInt::get(DL.getIndexType(GEP->getType()), Scaled); } - auto IndexIt = GEP->idx_begin(); - assert(cast(IndexIt)->getZExtValue() == 0 && - "GEP is not indexing through pointer"); - ++IndexIt; - Value *Offset = *IndexIt; - assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP"); - return Offset; + unsigned NumIndices = GEP->getNumIndices(); + + // If we have a single index we're indexing into a top level array. This + // generally only happens with cbuffers. + if (NumIndices == 1) + return *GEP->idx_begin(); + + // If we have two indices, this should be a simple access through a pointer. + if (NumIndices == 2) { + auto IndexIt = GEP->idx_begin(); + assert(cast(IndexIt)->getZExtValue() == 0 && + "GEP is not indexing through pointer"); + ++IndexIt; + Value *Offset = *IndexIt; + assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP"); + return Offset; + } + + llvm_unreachable("Unhandled GEP structure for resource access"); } static void createTypedBufferStore(IntrinsicInst *II, StoreInst *SI, @@ -171,6 +185,127 @@ static void createRawLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset) { LI->replaceAllUsesWith(V); } +namespace { +/// Helper for building a `load.cbufferrow` intrinsic given a simple type. +struct CBufferRowIntrin { + Intrinsic::ID IID; + Type *RetTy; + unsigned int EltSize; + unsigned int NumElts; + + CBufferRowIntrin(const DataLayout &DL, Type *Ty) { + assert(Ty == Ty->getScalarType() && "Expected scalar type"); + + switch (DL.getTypeSizeInBits(Ty)) { + case 16: + IID = Intrinsic::dx_resource_load_cbufferrow_8; + RetTy = StructType::get(Ty, Ty, Ty, Ty, Ty, Ty, Ty, Ty); + EltSize = 2; + NumElts = 8; + break; + case 32: + IID = Intrinsic::dx_resource_load_cbufferrow_4; + RetTy = StructType::get(Ty, Ty, Ty, Ty); + EltSize = 4; + NumElts = 4; + break; + case 64: + IID = Intrinsic::dx_resource_load_cbufferrow_2; + RetTy = StructType::get(Ty, Ty); + EltSize = 8; + NumElts = 2; + break; + default: + llvm_unreachable("Only 16, 32, and 64 bit types supported"); + } + } +}; +} // namespace + +static void createCBufferLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset, + dxil::ResourceTypeInfo &RTI) { + const DataLayout &DL = LI->getDataLayout(); + + Type *Ty = LI->getType(); + assert(!isa(Ty) && "Structs not handled yet"); + CBufferRowIntrin Intrin(DL, Ty->getScalarType()); + + StringRef Name = LI->getName(); + Value *Handle = II->getOperand(0); + + IRBuilder<> Builder(LI); + + ConstantInt *GlobalOffset = dyn_cast(II->getOperand(1)); + assert(GlobalOffset && "CBuffer getpointer index must be constant"); + + unsigned int FixedOffset = GlobalOffset->getZExtValue(); + // If we have a further constant offset we can just fold it in to the fixed + // offset. + if (auto *ConstOffset = dyn_cast_if_present(Offset)) { + FixedOffset += ConstOffset->getZExtValue(); + Offset = nullptr; + } + + Value *CurrentRow = ConstantInt::get( + Builder.getInt32Ty(), FixedOffset / hlsl::CBufferRowSizeInBytes); + unsigned int CurrentIndex = + (FixedOffset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize; + + assert(!(CurrentIndex && Offset) && + "Dynamic indexing into elements of cbuffer rows is not supported"); + // At this point if we have a non-constant offset it has to be an array + // offset, so we can assume that it's a multiple of the row size. + if (Offset) + CurrentRow = FixedOffset ? Builder.CreateAdd(CurrentRow, Offset) : Offset; + + auto *CBufLoad = Builder.CreateIntrinsic( + Intrin.RetTy, Intrin.IID, {Handle, CurrentRow}, nullptr, Name + ".load"); + auto *Elt = + Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, Name + ".extract"); + + // At this point we've loaded the first scalar of our result, but our original + // type may have been a vector. + unsigned int Remaining = + ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1; + if (Remaining == 0) { + // We only have a single element, so we're done. + Value *Result = Elt; + + // However, if we loaded a <1 x T>, then we need to adjust the type. + if (auto *VT = dyn_cast(Ty)) { + assert(VT->getNumElements() == 1 && "Can't have multiple elements here"); + Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result, + Builder.getInt32(0), Name); + } + LI->replaceAllUsesWith(Result); + return; + } + + // Walk each element and extract it, wrapping to new rows as needed. + SmallVector Extracts{Elt}; + while (Remaining--) { + CurrentIndex %= Intrin.NumElts; + + if (CurrentIndex == 0) { + CurrentRow = Builder.CreateAdd(CurrentRow, + ConstantInt::get(Builder.getInt32Ty(), 1)); + CBufLoad = Builder.CreateIntrinsic(Intrin.RetTy, Intrin.IID, + {Handle, CurrentRow}, nullptr, + Name + ".load"); + } + + Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, + Name + ".extract")); + } + + // Finally, we build up the original loaded value. + Value *Result = PoisonValue::get(Ty); + for (int I = 0, E = Extracts.size(); I < E; ++I) + Result = Builder.CreateInsertElement( + Result, Extracts[I], Builder.getInt32(I), Name + formatv(".upto{}", I)); + LI->replaceAllUsesWith(Result); +} + static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, dxil::ResourceTypeInfo &RTI) { switch (RTI.getResourceKind()) { @@ -179,6 +314,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, case dxil::ResourceKind::RawBuffer: case dxil::ResourceKind::StructuredBuffer: return createRawLoad(II, LI, Offset); + case dxil::ResourceKind::CBuffer: + return createCBufferLoad(II, LI, Offset, RTI); case dxil::ResourceKind::Texture1D: case dxil::ResourceKind::Texture2D: case dxil::ResourceKind::Texture2DMS: @@ -190,9 +327,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, case dxil::ResourceKind::TextureCubeArray: case dxil::ResourceKind::FeedbackTexture2D: case dxil::ResourceKind::FeedbackTexture2DArray: - case dxil::ResourceKind::CBuffer: case dxil::ResourceKind::TBuffer: - // TODO: handle these + reportFatalUsageError("Load not yet implemented for resource type"); return; case dxil::ResourceKind::Sampler: case dxil::ResourceKind::RTAccelerationStructure: diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll new file mode 100644 index 0000000000000..22fba8c1d5f8c --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll @@ -0,0 +1,59 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for indexed types in dynamically indexed arrays in cbuffers. +; +; struct S { +; float x[2]; +; uint q; +; }; +; cbuffer CB : register(b0) { +; uint32_t3 w[3]; // offset 0, size 12 (+4) * 3 +; S v[3]; // offset 48, size 24 (+8) * 3 +; } +%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }> +%__cblayout_CB = type <{ + <{ + [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], + <3 x i32> + }>, + target("dx.Padding", 4), + <{ + [2 x <{ %S, target("dx.Padding", 8) }>], %S + }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; w[2].z + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: store i32 [[X]], ptr %dst + %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_ptr, i32 40 + %w_load = load i32, ptr addrspace(2) %w_gep, align 4 + store i32 %w_load, ptr %dst, align 4 + + ;; v[2].q + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_ptr, i32 84 + %v_load = load i32, ptr addrspace(2) %v_gep, align 4 + %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %v_load, ptr %v.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll new file mode 100644 index 0000000000000..615fc5ea07eca --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Test for when we have indices into both the array and the vector: ie, s[1][3] + +; cbuffer CB : register(b0) { +; uint4 s[3]; // offset 0, size 16 * 3 +; } +%__cblayout_CB = type <{ [2 x <4 x i32>] }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; s[1][3] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: store i32 [[X]], ptr %dst + %i8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %i8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %i8_ptr, i32 28 + %i8_vecext = load i32, ptr addrspace(2) %i8_gep, align 4 + store i32 %i8_vecext, ptr %dst, align 4 + + ;; s[2].w + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ;; + ;; It would be nice to avoid the redundant vector creation here, but that's + ;; outside of the scope of this pass. + ;; + ; CHECK: [[X_VEC:%.*]] = insertelement <4 x i32> {{%.*}}, i32 [[X]], i32 3 + ; CHECK: [[X_EXT:%.*]] = extractelement <4 x i32> [[X_VEC]], i32 3 + ; CHECK: store i32 [[X_EXT]], ptr %dst + %typed_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %typed_gep = getelementptr <4 x i32>, ptr addrspace(2) %typed_ptr, i32 2 + %typed_load = load <4 x i32>, ptr addrspace(2) %typed_gep, align 16 + %typed_vecext = extractelement <4 x i32> %typed_load, i32 3 + store i32 %typed_vecext, ptr %dst, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll new file mode 100644 index 0000000000000..eabc07c2fbb68 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll @@ -0,0 +1,30 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; +; } +%__cblayout_CB = type <{ [2 x <{ float, [12 x i8] }>], float }> + +@CB.cb = global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h = call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ;; a1[1] + ;; Note that the valid GEPs of a1 are `0, 0, 0`, `0, 0, 1`, and `0, 1`. + ; + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1_gep = getelementptr inbounds <{ [2 x <{ float, [12 x i8] }>], float }>, ptr addrspace(2) %a1_ptr, i32 0, i32 0, i32 1 + %a1 = load float, ptr addrspace(2) %a1_gep, align 4 + store float %a1, ptr %dst, align 32 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll new file mode 100644 index 0000000000000..6f6166e820a6f --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll @@ -0,0 +1,145 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; // offset 0, size 4 (+12) * 3 +; double3 a2[2]; // offset 48, size 24 (+8) * 2 +; float16_t a3[2][2]; // offset 112, size 2 (+14) * 4 +; uint64_t a4[3]; // offset 176, size 8 (+8) * 3 +; int4 a5[2][3][4]; // offset 224, size 16 * 24 +; uint16_t a6[1]; // offset 608, size 2 (+14) * 1 +; int64_t a7[2]; // offset 624, size 8 (+8) * 2 +; bool a8[4]; // offset 656, size 4 (+12) * 4 +; } +%__cblayout_CB = type <{ + <{ [2 x <{ float, target("dx.Padding", 12) }>], float }>, target("dx.Padding", 12), + <{ [1 x <{ <3 x double>, target("dx.Padding", 8) }>], <3 x double> }>, target("dx.Padding", 8), + <{ [3 x <{ half, target("dx.Padding", 14) }>], half }>, target("dx.Padding", 14), + <{ [2 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8), + [24 x <4 x i32>], + [1 x i16], target("dx.Padding", 14), + <{ [1 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8), + <{ [3 x <{ i32, target("dx.Padding", 12) }>], i32 }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; a1[1] + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a1_ptr, i32 16 + %a1 = load float, ptr addrspace(2) %a1_gep, align 4 + store float %a1, ptr %dst, align 32 + + ;; a2[1] + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6) + ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %a2_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a2_ptr, i32 32 + %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 8 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store <3 x double> %a2, ptr %a2.i, align 32 + + ;; a3[0][1] + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8) + ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 32 + ; CHECK: store half [[X]], ptr [[PTR]] + %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 112) + %a3_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a3_ptr, i32 16 + %a3 = load half, ptr addrspace(2) %a3_gep, align 2 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 32 + store half %a3, ptr %a3.i, align 2 + + ;; a4[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 12) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 176) + %a4_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a4_ptr, i32 16 + %a4 = load i64, ptr addrspace(2) %a4_gep, align 8 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store i64 %a4, ptr %a4.i, align 8 + + ;; a5[1][0][0] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 26) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 224) + %a5_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a5_ptr, i32 192 + %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 4 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <4 x i32> %a5, ptr %a5.i, align 4 + + ;; a6[0] + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 38) + ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 64 + ; CHECK: store i16 [[X]], ptr [[PTR]] + %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 608) + %a6 = load i16, ptr addrspace(2) %a6_ptr, align 2 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 64 + store i16 %a6, ptr %a6.i, align 2 + + ;; a7[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 40) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 624) + %a7_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a7_ptr, i32 16 + %a7 = load i64, ptr addrspace(2) %a7_gep, align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 72 + store i64 %a7, ptr %a7.i, align 8 + + ;; a8[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 42) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 80 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %a8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 656) + %a8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a8_ptr, i32 16 + %a8 = load i32, ptr addrspace(2) %a8_gep, align 4, !range !0, !noundef !1 + %a8.i = getelementptr inbounds nuw i8, ptr %dst, i32 80 + store i32 %a8, ptr %a8.i, align 4 + + ret void +} + +!0 = !{i32 0, i32 2} +!1 = !{} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll new file mode 100644 index 0000000000000..22994cfc3f48a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll @@ -0,0 +1,64 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for indexed types in dynamically indexed arrays in cbuffers. +; +; Bug https://github.com/llvm/llvm-project/issues/164517 +; XFAIL: * +; +; struct S { +; float x[2]; +; uint q; +; }; +; cbuffer CB : register(b0) { +; uint32_t3 w[3]; // offset 0, size 12 (+4) * 3 +; S v[3]; // offset 48, size 24 (+8) * 3 +; } +%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }> +%__cblayout_CB = type <{ + <{ + [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], + <3 x i32> + }>, + target("dx.Padding", 4), + <{ + [2 x <{ %S, target("dx.Padding", 8) }>], %S + }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; w[idx].z + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: store i32 [[X]], ptr %dst + %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %w_arrayidx = getelementptr <3 x i32>, ptr addrspace(2) %w_ptr, i32 %idx + %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_arrayidx, i32 4 + %w_load = load i32, ptr addrspace(2) %w_gep, align 4 + store i32 %w_load, ptr %dst, align 4 + + ;; v[idx].q + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %v_arrayidx = getelementptr <{ %struct.S, target("dx.Padding", 4) }>, ptr addrspace(2) %v_ptr, i32 %idx + %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_arrayidx, i32 8 + %v_load = load i32, ptr addrspace(2) %v_gep, align 4 + %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %v_load, ptr %v.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll new file mode 100644 index 0000000000000..7daebaed70442 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll @@ -0,0 +1,46 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for dynamic indices into arrays in cbuffers. + +; cbuffer CB : register(b0) { +; uint s[10]; // offset 0, size 4 (+12) * 10 +; uint t[12]; // offset 160, size 4 (+12) * 12 +; } +%__cblayout_CB = type <{ <{ [9 x <{ i32, target("dx.Padding", 12) }>], i32 }>, target("dx.Padding", 12), <{ [11 x <{ i32, target("dx.Padding", 12) }>], i32 }> }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; s[idx] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: store i32 [[X]], ptr %dst + %s_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %s_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %s_ptr, i32 %idx + %s_load = load i32, ptr addrspace(2) %s_gep, align 4 + store i32 %s_load, ptr %dst, align 4 + + ;; t[idx] + ; + ; CHECK: [[T_IDX:%.*]] = add i32 10, %idx + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 [[T_IDX]]) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %t_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 160) + %t_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %t_ptr, i32 %idx + %t_load = load i32, ptr addrspace(2) %t_gep, align 4 + %t.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %t_load, ptr %t.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll new file mode 100644 index 0000000000000..65c9a3ec966e9 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll @@ -0,0 +1,101 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB { +; float a1; // offset 0, size 4 +; int a2; // offset 4, size 4 +; bool a3; // offset 8, size 4 +; float16_t a4; // offset 12, size 2 +; uint16_t a5; // offset 14, size 2 +; double a6; // offset 16, size 8 +; int64_t a7; // offset 24, size 8 +; } +%__cblayout_CB = type <{ float, i32, i32, half, i16, double, i64 }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + + ;; a1 + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A1:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[A1]], ptr %dst + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1 = load float, ptr addrspace(2) %a1_ptr, align 4 + store float %a1, ptr %dst, align 8 + + ;; a2 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A2:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[A2]], ptr [[PTR]] + %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 4) + %a2 = load i32, ptr addrspace(2) %a2_ptr, align 4 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %a2, ptr %a2.i, align 8 + + ;; a3 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A3:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store i32 [[A3]], ptr [[PTR]] + %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 8) + %a3 = load i32, ptr addrspace(2) %a3_ptr, align 4 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store i32 %a3, ptr %a3.i, align 4 + + ;; a4 + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A4:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 6 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 12 + ; CHECK: store half [[A4]], ptr [[PTR]] + %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 12) + %a4 = load half, ptr addrspace(2) %a4_ptr, align 2 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 12 + store half %a4, ptr %a4.i, align 4 + + ;; a5 + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A5:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 7 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 14 + ; CHECK: store i16 [[A5]], ptr [[PTR]] + %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 14) + %a5 = load i16, ptr addrspace(2) %a5_ptr, align 2 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 14 + store i16 %a5, ptr %a5.i, align 2 + + ;; a6 + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[A6:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store double [[A6]], ptr [[PTR]] + %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16) + %a6 = load double, ptr addrspace(2) %a6_ptr, align 8 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store double %a6, ptr %a6.i, align 8 + + ;; a7 + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[A7:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 24 + ; CHECK: store i64 [[A7]], ptr [[PTR]] + %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 24) + %a7 = load i64, ptr addrspace(2) %a7_ptr, align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 24 + store i64 %a7, ptr %a7.i, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll new file mode 100644 index 0000000000000..0156a1a0472ab --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll @@ -0,0 +1,121 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB { +; float3 a1; // offset 0, size 12 (+4) +; double3 a2; // offset 16, size 24 +; float16_t2 a3; // offset 40, size 4 (+4) +; uint64_t3 a4; // offset 48, size 24 (+8) +; int4 a5; // offset 80, size 16 +; uint16_t3 a6; // offset 96, size 6 +; }; +%__cblayout_CB = type <{ <3 x float>, target("dx.Padding", 4), <3 x double>, <2 x half>, target("dx.Padding", 4), <3 x i64>, target("dx.Padding", 8), <4 x i32>, <3 x i16> }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + + ;; a1 + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 2 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x float> poison, float [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x float> [[VEC0]], float [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x float> [[VEC1]], float [[Z]], i32 2 + ; CHECK: store <3 x float> [[VEC2]], ptr %dst + %a1_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1 = load <3 x float>, ptr addrspace(2) %a1_gep, align 16 + store <3 x float> %a1, ptr %dst, align 4 + + ;; a2 + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16) + %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 32 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store <3 x double> %a2, ptr %a2.i, align 8 + + ;; a3 + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 4 + ; CHECK: [[Y:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 5 + ; CHECK: [[VEC0:%.*]] = insertelement <2 x half> poison, half [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <2 x half> [[VEC0]], half [[Y]], i32 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store <2 x half> [[VEC1]], ptr [[PTR]] + %a3_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 40) + %a3 = load <2 x half>, ptr addrspace(2) %a3_gep, align 4 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store <2 x half> %a3, ptr %a3.i, align 2 + + ;; a4 + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 3) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 4) + ; CHECK: [[Z:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x i64> poison, i64 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x i64> [[VEC0]], i64 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x i64> [[VEC1]], i64 [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <3 x i64> [[VEC2]], ptr [[PTR]] + %a4_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %a4 = load <3 x i64>, ptr addrspace(2) %a4_gep, align 32 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <3 x i64> %a4, ptr %a4.i, align 8 + + ;; a5 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 80) + %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 16 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 72 + store <4 x i32> %a5, ptr %a5.i, align 4 + + ;; a6 + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6) + ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 2 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x i16> poison, i16 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x i16> [[VEC0]], i16 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x i16> [[VEC1]], i16 [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 88 + ; CHECK: store <3 x i16> [[VEC2]], ptr [[PTR]] + %a6_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 96) + %a6 = load <3 x i16>, ptr addrspace(2) %a6_gep, align 8 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 88 + store <3 x i16> %a6, ptr %a6.i, align 2 + + ret void +} From 793ab6a80d22a2357e5fdbc1e9f51d0ed8dd5a4f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 10 Nov 2025 12:34:00 -0800 Subject: [PATCH 03/30] X86: Enable terminal rule (#165957) --- llvm/lib/Target/X86/X86Subtarget.h | 2 + llvm/test/CodeGen/X86/3addr-16bit.ll | 48 +- llvm/test/CodeGen/X86/atomic-rm-bit-test.ll | 22 +- .../CodeGen/X86/atomicrmw-fadd-fp-vector.ll | 3 +- llvm/test/CodeGen/X86/bitcast-vector-bool.ll | 32 +- .../X86/coalescer-dead-flag-verifier-error.ll | 4 +- llvm/test/CodeGen/X86/fold-loop-of-urem.ll | 81 +- llvm/test/CodeGen/X86/freeze-binary.ll | 26 +- llvm/test/CodeGen/X86/i128-mul.ll | 178 +- llvm/test/CodeGen/X86/icmp-abs-C.ll | 22 +- .../test/CodeGen/X86/masked_gather_scatter.ll | 12 +- llvm/test/CodeGen/X86/midpoint-int.ll | 28 +- llvm/test/CodeGen/X86/mmx-arith.ll | 3 +- llvm/test/CodeGen/X86/mul-constant-i16.ll | 8 +- llvm/test/CodeGen/X86/mul-constant-i32.ll | 16 +- llvm/test/CodeGen/X86/mul-constant-i8.ll | 4 +- llvm/test/CodeGen/X86/optimize-max-0.ll | 211 +- llvm/test/CodeGen/X86/parity.ll | 30 +- llvm/test/CodeGen/X86/pr166744.ll | 14 +- llvm/test/CodeGen/X86/rotate-extract.ll | 4 +- llvm/test/CodeGen/X86/smul_fix.ll | 8 +- llvm/test/CodeGen/X86/sshl_sat.ll | 40 +- llvm/test/CodeGen/X86/sshl_sat_vec.ll | 113 +- llvm/test/CodeGen/X86/stackmap.ll | 9 +- .../subvectorwise-store-of-vector-splat.ll | 210 +- llvm/test/CodeGen/X86/twoaddr-lea.ll | 2 +- llvm/test/CodeGen/X86/umul_fix.ll | 8 +- llvm/test/CodeGen/X86/ushl_sat.ll | 28 +- llvm/test/CodeGen/X86/ushl_sat_vec.ll | 111 +- .../CodeGen/X86/vector-mulfix-legalize.ll | 34 +- .../CodeGen/X86/vector-reduce-xor-bool.ll | 160 +- ...lar-shift-by-byte-multiple-legalization.ll | 6081 ++++++++--------- .../X86/wide-scalar-shift-legalization.ll | 1344 ++-- ...ad-of-small-alloca-with-zero-upper-half.ll | 328 +- .../CodeGen/X86/widen-load-of-small-alloca.ll | 95 +- llvm/test/CodeGen/X86/x86-shrink-wrapping.ll | 18 +- llvm/test/CodeGen/X86/xor.ll | 132 +- .../LoopStrengthReduce/X86/ivchain-X86.ll | 21 +- 38 files changed, 4728 insertions(+), 4762 deletions(-) diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 868f41375b96b..4f5aadca361fe 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -419,6 +419,8 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } + bool enableTerminalRule() const override { return true; } + bool enableEarlyIfConversion() const override; void getPostRAMutations(std::vector> diff --git a/llvm/test/CodeGen/X86/3addr-16bit.ll b/llvm/test/CodeGen/X86/3addr-16bit.ll index c9390d91d59c2..2b692bff0461e 100644 --- a/llvm/test/CodeGen/X86/3addr-16bit.ll +++ b/llvm/test/CodeGen/X86/3addr-16bit.ll @@ -10,27 +10,27 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test1: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: incl %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: incl %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB0_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB0_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test1: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: incl %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB0_2 @@ -63,27 +63,27 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test2: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: decl %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: decl %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB1_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB1_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test2: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: decl %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB1_2 @@ -118,27 +118,27 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test3: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: addl $2, %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: addl $2, %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB2_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB2_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test3: ; X86: ## %bb.0: ## %entry ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl $2, %eax ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx ; X86-NEXT: jne LBB2_2 @@ -171,19 +171,19 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-LABEL: test4: ; X64: ## %bb.0: ## %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: cmpw %di, %si +; X64-NEXT: addl %edi, %esi +; X64-NEXT: cmpw %di, %ax ; X64-NEXT: jne LBB3_2 ; X64-NEXT: ## %bb.1: ## %bb ; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl %ax, %ebx +; X64-NEXT: movzwl %si, %ebx ; X64-NEXT: movl %ebx, %edi ; X64-NEXT: callq _foo ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq ; X64-NEXT: LBB3_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: movzwl %si, %eax ; X64-NEXT: retq ; ; X86-LABEL: test4: @@ -191,8 +191,8 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: cmpw %cx, %dx ; X86-NEXT: jne LBB3_2 diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll index b4d40fee01e41..71887e369bd18 100644 --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll @@ -2156,15 +2156,17 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no ; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_brz: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %ecx, %edx ; X64-NEXT: andb $15, %cl -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx +; X64-NEXT: movl $1, %esi +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shll %cl, %esi ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: .p2align 4 ; X64-NEXT: .LBB34_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: xorl %edx, %ecx +; X64-NEXT: xorl %esi, %ecx ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: lock cmpxchgw %cx, (%rdi) ; X64-NEXT: # kill: def $ax killed $ax def $eax @@ -2172,12 +2174,12 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no ; X64-NEXT: # %bb.2: # %atomicrmw.end ; X64-NEXT: movzwl %ax, %ecx ; X64-NEXT: movw $123, %ax -; X64-NEXT: testl %ecx, %edx +; X64-NEXT: testl %ecx, %esi ; X64-NEXT: je .LBB34_3 ; X64-NEXT: # %bb.4: # %return ; X64-NEXT: retq ; X64-NEXT: .LBB34_3: # %if.then -; X64-NEXT: movzwl %si, %eax +; X64-NEXT: movzwl %dx, %eax ; X64-NEXT: movzwl (%rdi,%rax,2), %eax ; X64-NEXT: retq entry: @@ -3398,10 +3400,12 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n ; X64-LABEL: atomic_shl1_mask01_and_16_gpr_brnz: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %ecx, %edx ; X64-NEXT: andb $15, %cl -; X64-NEXT: movl $1, %edx -; X64-NEXT: shll %cl, %edx +; X64-NEXT: movl $1, %esi +; X64-NEXT: shll %cl, %esi ; X64-NEXT: movl $-2, %r8d +; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: roll %cl, %r8d ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: .p2align 4 @@ -3415,10 +3419,10 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n ; X64-NEXT: jne .LBB52_1 ; X64-NEXT: # %bb.2: # %atomicrmw.end ; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: testl %eax, %edx +; X64-NEXT: testl %eax, %esi ; X64-NEXT: je .LBB52_3 ; X64-NEXT: # %bb.4: # %if.then -; X64-NEXT: movzwl %si, %eax +; X64-NEXT: movzwl %dx, %eax ; X64-NEXT: movzwl (%rdi,%rax,2), %eax ; X64-NEXT: retq ; X64-NEXT: .LBB52_3: diff --git a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll index 105ee7f82ee79..e118f5dbc1534 100644 --- a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll +++ b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll @@ -46,8 +46,9 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x ; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: lock cmpxchgl %ecx, (%rbx) ; CHECK-NEXT: setne %cl -; CHECK-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-NEXT: movl %eax, %edx ; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: pinsrw $0, %edx, %xmm0 ; CHECK-NEXT: pinsrw $0, %eax, %xmm1 ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: jne .LBB0_1 diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index 86d7df0c2d648..fae1ff90dd8d5 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -216,8 +216,8 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind { define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { ; SSE-LABEL: bitcast_v16i8_to_v2i8: ; SSE: # %bb.0: -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -225,8 +225,8 @@ define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind { ; ; AVX12-LABEL: bitcast_v16i8_to_v2i8: ; AVX12: # %bb.0: -; AVX12-NEXT: vpmovmskb %xmm0, %ecx -; AVX12-NEXT: movl %ecx, %eax +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: movl %eax, %ecx ; AVX12-NEXT: shrl $8, %eax ; AVX12-NEXT: addb %cl, %al ; AVX12-NEXT: # kill: def $al killed $al killed $eax @@ -441,8 +441,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; SSE-LABEL: bitcast_v16i16_to_v2i8: ; SSE: # %bb.0: ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -452,8 +452,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: addb %cl, %al ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -464,8 +464,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: addb %cl, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -762,8 +762,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; SSE-NEXT: packssdw %xmm3, %xmm2 ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %ecx -; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx ; SSE-NEXT: shrl $8, %eax ; SSE-NEXT: addb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax @@ -776,8 +776,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %eax +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: addb %cl, %al ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -793,8 +793,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-NEXT: vpmovmskb %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: addb %cl, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll index 4d41c8406f6e0..a42a715bdc6ab 100644 --- a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll +++ b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll @@ -7,8 +7,8 @@ define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0_(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0_: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .p2align 4 @@ -68,8 +68,8 @@ _ZNK4llvm5APInt13getActiveBitsEv.exit.i.i: ; preds = %for.body.i.i.i.i.i define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0__assert(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0__assert: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB1_1 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll index c1beb7c803b2b..c9c88f7258435 100644 --- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll +++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll @@ -1031,31 +1031,30 @@ define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind { ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: je .LBB17_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $1, %r15d +; CHECK-NEXT: movl $1, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB17_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB17_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB17_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1199,32 +1198,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_ ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB21_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB21_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB21_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB21_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1251,32 +1249,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB22_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB22_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB22_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB22_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1303,31 +1300,30 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem( ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB23_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB23_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB23_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB23_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1404,32 +1400,31 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3 ; CHECK-NEXT: cmpl %edx, %edi ; CHECK-NEXT: jbe .LBB25_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r15d -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edx, %ebx +; CHECK-NEXT: movl %esi, %ebp ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: addl $-2, %r15d +; CHECK-NEXT: addl $-2, %ebx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB25_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %ebx +; CHECK-NEXT: divl %ebp ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $-2, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB25_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB25_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index e223765eb887b..46b2571e196bb 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -490,20 +490,21 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind { define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_ashr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sarl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sarl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $6, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: sarl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: sarl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: sarl $6, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = ashr exact i32 %a0, 3 %y = freeze i32 %x @@ -604,20 +605,21 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind { define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_lshr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: shrl $5, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: shrl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrl $5, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = lshr exact i32 %a0, 3 %y = freeze i32 %x diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll index cffd88c55bb0a..477a0dce5c81c 100644 --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll @@ -111,62 +111,63 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: orl %ecx, %eax ; X86-NOBMI-NEXT: je .LBB1_3 ; X86-NOBMI-NEXT: # %bb.1: # %for.body.preheader -; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: xorl %edx, %edx +; X86-NOBMI-NEXT: xorl %esi, %esi ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: movl $0, (%esp) # 4-byte Folded Spill +; X86-NOBMI-NEXT: xorl %edi, %edi +; X86-NOBMI-NEXT: xorl %ebp, %ebp ; X86-NOBMI-NEXT: .p2align 4 ; X86-NOBMI-NEXT: .LBB1_2: # %for.body ; X86-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %edi -; X86-NOBMI-NEXT: movl 4(%eax,%ecx,8), %ebx +; X86-NOBMI-NEXT: movl (%eax,%edi,8), %ebp +; X86-NOBMI-NEXT: movl 4(%eax,%edi,8), %ebx ; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %edi, %eax -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebp +; X86-NOBMI-NEXT: movl %ebp, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %ebx, %eax -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebx -; X86-NOBMI-NEXT: movl %eax, %esi -; X86-NOBMI-NEXT: addl %ebp, %esi -; X86-NOBMI-NEXT: adcl $0, %ebx -; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: movl %eax, %ebx +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl $0, %edx +; X86-NOBMI-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: mull %edx -; X86-NOBMI-NEXT: movl %edx, %ebp -; X86-NOBMI-NEXT: movl %eax, %edi -; X86-NOBMI-NEXT: addl %esi, %edi -; X86-NOBMI-NEXT: adcl %ebx, %ebp -; X86-NOBMI-NEXT: setb %bl +; X86-NOBMI-NEXT: movl %eax, %ebp +; X86-NOBMI-NEXT: addl %ebx, %ebp +; X86-NOBMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ebx +; X86-NOBMI-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOBMI-NEXT: addl %ebp, %eax -; X86-NOBMI-NEXT: movzbl %bl, %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NOBMI-NEXT: adcl %esi, %edx -; X86-NOBMI-NEXT: movl %ecx, %ebx -; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl $0, %eax -; X86-NOBMI-NEXT: adcl $0, %edx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebx,8) -; X86-NOBMI-NEXT: movl %ebx, %ecx -; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebx,8) -; X86-NOBMI-NEXT: addl $1, %ecx -; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-NOBMI-NEXT: adcl $0, %edi -; X86-NOBMI-NEXT: movl %ecx, %esi -; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NOBMI-NEXT: xorl %ebp, %edi -; X86-NOBMI-NEXT: orl %esi, %edi +; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: addl %ebx, %esi +; X86-NOBMI-NEXT: movl %ecx, %eax +; X86-NOBMI-NEXT: movzbl (%esp), %ebx # 1-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ecx +; X86-NOBMI-NEXT: adcl %ebx, %ecx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl %eax, %ebp +; X86-NOBMI-NEXT: adcl $0, %esi +; X86-NOBMI-NEXT: adcl $0, %ecx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl %edx, (%eax,%edi,8) +; X86-NOBMI-NEXT: movl %ebp, 4(%eax,%edi,8) +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI-NEXT: addl $1, %edi +; X86-NOBMI-NEXT: adcl $0, %ebp +; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: xorl %edx, %eax +; X86-NOBMI-NEXT: movl %ebp, %edx +; X86-NOBMI-NEXT: xorl %ebx, %edx +; X86-NOBMI-NEXT: orl %eax, %edx ; X86-NOBMI-NEXT: jne .LBB1_2 ; X86-NOBMI-NEXT: .LBB1_3: # %for.end ; X86-NOBMI-NEXT: xorl %eax, %eax @@ -184,71 +185,66 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-BMI-NEXT: pushl %ebx ; X86-BMI-NEXT: pushl %edi ; X86-BMI-NEXT: pushl %esi -; X86-BMI-NEXT: subl $20, %esp +; X86-BMI-NEXT: subl $16, %esp ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: orl %ecx, %eax ; X86-BMI-NEXT: je .LBB1_3 ; X86-BMI-NEXT: # %bb.1: # %for.body.preheader -; X86-BMI-NEXT: xorl %ecx, %ecx -; X86-BMI-NEXT: xorl %eax, %eax +; X86-BMI-NEXT: xorl %esi, %esi +; X86-BMI-NEXT: xorl %edi, %edi ; X86-BMI-NEXT: xorl %ebx, %ebx -; X86-BMI-NEXT: xorl %ebp, %ebp +; X86-BMI-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-BMI-NEXT: .p2align 4 ; X86-BMI-NEXT: .LBB1_2: # %for.body ; X86-BMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: movl (%eax,%ebx,8), %ecx -; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %esi -; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %ebp +; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI-NEXT: mulxl %eax, %edx, %edi +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %edx, %eax +; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %esi, %edx -; X86-BMI-NEXT: mulxl %eax, %esi, %eax -; X86-BMI-NEXT: addl %edi, %esi -; X86-BMI-NEXT: adcl $0, %eax +; X86-BMI-NEXT: movl %ebp, %edx +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %eax, %ebp +; X86-BMI-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, %ebp ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp -; X86-BMI-NEXT: addl %esi, %edi -; X86-BMI-NEXT: adcl %eax, %ebp +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %ecx, %edx +; X86-BMI-NEXT: addl %eax, %ecx +; X86-BMI-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-BMI-NEXT: movl %esi, %eax +; X86-BMI-NEXT: adcl %ebp, %edx +; X86-BMI-NEXT: movl %edx, %ebp ; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %esi, %edi ; X86-BMI-NEXT: setb %dl -; X86-BMI-NEXT: addl %ebp, %ecx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI-NEXT: addl %ebp, %esi ; X86-BMI-NEXT: movzbl %dl, %edx -; X86-BMI-NEXT: adcl %edx, %eax -; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-BMI-NEXT: adcl (%esp), %edi # 4-byte Folded Reload -; X86-BMI-NEXT: adcl $0, %ecx -; X86-BMI-NEXT: adcl $0, %edx -; X86-BMI-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI-NEXT: movl %eax, (%edx,%ebx,8) -; X86-BMI-NEXT: movl %edi, 4(%edx,%ebx,8) -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-BMI-NEXT: adcl %edx, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-BMI-NEXT: addl %eax, %edx +; X86-BMI-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, %esi +; X86-BMI-NEXT: adcl $0, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %edx, (%eax,%ebx,8) +; X86-BMI-NEXT: movl %ecx, 4(%eax,%ebx,8) ; X86-BMI-NEXT: addl $1, %ebx -; X86-BMI-NEXT: adcl $0, %ebp -; X86-BMI-NEXT: movl %ebx, %edx -; X86-BMI-NEXT: xorl %esi, %edx -; X86-BMI-NEXT: movl %ebp, %esi -; X86-BMI-NEXT: xorl %edi, %esi -; X86-BMI-NEXT: orl %edx, %esi -; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-BMI-NEXT: adcl $0, %ecx +; X86-BMI-NEXT: movl %ebx, %eax +; X86-BMI-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: xorl %ebp, %ecx +; X86-BMI-NEXT: orl %eax, %ecx ; X86-BMI-NEXT: jne .LBB1_2 ; X86-BMI-NEXT: .LBB1_3: # %for.end ; X86-BMI-NEXT: xorl %eax, %eax ; X86-BMI-NEXT: xorl %edx, %edx -; X86-BMI-NEXT: addl $20, %esp +; X86-BMI-NEXT: addl $16, %esp ; X86-BMI-NEXT: popl %esi ; X86-BMI-NEXT: popl %edi ; X86-BMI-NEXT: popl %ebx @@ -261,11 +257,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-NOBMI-NEXT: je .LBB1_3 ; X64-NOBMI-NEXT: # %bb.1: # %for.body.preheader ; X64-NOBMI-NEXT: movq %rdx, %r8 -; X64-NOBMI-NEXT: xorl %r10d, %r10d +; X64-NOBMI-NEXT: xorl %edx, %edx ; X64-NOBMI-NEXT: xorl %r9d, %r9d ; X64-NOBMI-NEXT: .p2align 4 ; X64-NOBMI-NEXT: .LBB1_2: # %for.body ; X64-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: movq %rcx, %rax ; X64-NOBMI-NEXT: mulq (%r8,%r9,8) ; X64-NOBMI-NEXT: addq %r10, %rax @@ -273,7 +270,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-NOBMI-NEXT: movq %rax, (%rsi,%r9,8) ; X64-NOBMI-NEXT: incq %r9 ; X64-NOBMI-NEXT: cmpq %r9, %rdi -; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: jne .LBB1_2 ; X64-NOBMI-NEXT: .LBB1_3: # %for.end ; X64-NOBMI-NEXT: xorl %eax, %eax @@ -285,11 +281,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-BMI-NEXT: je .LBB1_3 ; X64-BMI-NEXT: # %bb.1: # %for.body.preheader ; X64-BMI-NEXT: movq %rdx, %rax -; X64-BMI-NEXT: xorl %r9d, %r9d +; X64-BMI-NEXT: xorl %edx, %edx ; X64-BMI-NEXT: xorl %r8d, %r8d ; X64-BMI-NEXT: .p2align 4 ; X64-BMI-NEXT: .LBB1_2: # %for.body ; X64-BMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: movq %rcx, %rdx ; X64-BMI-NEXT: mulxq (%rax,%r8,8), %r10, %rdx ; X64-BMI-NEXT: addq %r9, %r10 @@ -297,7 +294,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-BMI-NEXT: movq %r10, (%rsi,%r8,8) ; X64-BMI-NEXT: incq %r8 ; X64-BMI-NEXT: cmpq %r8, %rdi -; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: jne .LBB1_2 ; X64-BMI-NEXT: .LBB1_3: # %for.end ; X64-BMI-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll index 53b70fa38958b..c98889b7d5cb3 100644 --- a/llvm/test/CodeGen/X86/icmp-abs-C.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll @@ -161,22 +161,22 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind { ; X86-LABEL: ne_and_with_dom_abs: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movswl %ax, %ecx +; X86-NEXT: sarl $15, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: xorl $12312, %eax # imm = 0x3018 ; X86-NEXT: movzwl %ax, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpw $64, %cx -; X86-NEXT: setne %cl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpw $64, %dx +; X86-NEXT: setne %dl ; X86-NEXT: cmpl $2345, %esi # imm = 0x929 ; X86-NEXT: jae .LBB3_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movb %dl, %cl +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB3_2: ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index caec02eaa19c7..2f691e7ca8f5b 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -255,9 +255,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-KNL-NEXT: kmovw %k1, %k2 ; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} -; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} -; X64-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X64-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} +; X64-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0 ; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test7: @@ -271,9 +271,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-KNL-NEXT: kmovw %k1, %k2 ; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} -; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} -; X86-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} +; X86-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0 ; X86-KNL-NEXT: retl ; ; X64-SKX-LABEL: test7: diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll index a75d42ed0c50f..c058e37e0ce11 100644 --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -658,9 +658,9 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -710,9 +710,9 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setbe %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -765,9 +765,9 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %ecx +; X86-NEXT: movzwl (%eax), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -817,11 +817,11 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_reg_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -871,12 +871,12 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_mem_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index 73d459ba77026..8f97d2652bc53 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -403,11 +403,11 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: je .LBB3_1 ; X86-NEXT: # %bb.2: # %bb26.preheader ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB3_3: # %bb26 @@ -427,7 +427,6 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind { ; X86-NEXT: jb .LBB3_3 ; X86-NEXT: jmp .LBB3_4 ; X86-NEXT: .LBB3_1: -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB3_4: # %bb31 ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll b/llvm/test/CodeGen/X86/mul-constant-i16.ll index b1aa789e53cd7..a663f6a1dd376 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i16.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll @@ -715,8 +715,8 @@ define i16 @test_mul_by_66(i16 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $6, %eax -; X64-NEXT: leal (%rax,%rdi,2), %eax +; X64-NEXT: shll $6, %edi +; X64-NEXT: leal (%rdi,%rax,2), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %mul = mul nsw i16 %x, 66 @@ -757,8 +757,8 @@ define i16 @test_mul_by_520(i16 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $9, %eax -; X64-NEXT: leal (%rax,%rdi,8), %eax +; X64-NEXT: shll $9, %edi +; X64-NEXT: leal (%rdi,%rax,8), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %mul = mul nsw i16 %x, 520 diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll index 79889b9ace406..4129b44ed3ddc 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i32.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll @@ -1155,16 +1155,16 @@ define i32 @test_mul_by_66(i32 %x) { ; X64-HSW: # %bb.0: ; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-HSW-NEXT: movl %edi, %eax -; X64-HSW-NEXT: shll $6, %eax -; X64-HSW-NEXT: leal (%rax,%rdi,2), %eax +; X64-HSW-NEXT: shll $6, %edi +; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax ; X64-HSW-NEXT: retq ; ; X64-JAG-LABEL: test_mul_by_66: ; X64-JAG: # %bb.0: ; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi ; X64-JAG-NEXT: movl %edi, %eax -; X64-JAG-NEXT: shll $6, %eax -; X64-JAG-NEXT: leal (%rax,%rdi,2), %eax +; X64-JAG-NEXT: shll $6, %edi +; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax ; X64-JAG-NEXT: retq ; ; X86-NOOPT-LABEL: test_mul_by_66: @@ -1241,16 +1241,16 @@ define i32 @test_mul_by_520(i32 %x) { ; X64-HSW: # %bb.0: ; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-HSW-NEXT: movl %edi, %eax -; X64-HSW-NEXT: shll $9, %eax -; X64-HSW-NEXT: leal (%rax,%rdi,8), %eax +; X64-HSW-NEXT: shll $9, %edi +; X64-HSW-NEXT: leal (%rdi,%rax,8), %eax ; X64-HSW-NEXT: retq ; ; X64-JAG-LABEL: test_mul_by_520: ; X64-JAG: # %bb.0: ; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi ; X64-JAG-NEXT: movl %edi, %eax -; X64-JAG-NEXT: shll $9, %eax -; X64-JAG-NEXT: leal (%rax,%rdi,8), %eax +; X64-JAG-NEXT: shll $9, %edi +; X64-JAG-NEXT: leal (%rdi,%rax,8), %eax ; X64-JAG-NEXT: retq ; ; X86-NOOPT-LABEL: test_mul_by_520: diff --git a/llvm/test/CodeGen/X86/mul-constant-i8.ll b/llvm/test/CodeGen/X86/mul-constant-i8.ll index a4fa1ee8c0029..b488653655728 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i8.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i8.ll @@ -425,8 +425,8 @@ define i8 @test_mul_by_66(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $6, %eax -; X64-NEXT: leal (%rax,%rdi,2), %eax +; X64-NEXT: shll $6, %edi +; X64-NEXT: leal (%rdi,%rax,2), %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 66 diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll index 283c00e17f21a..b6af7e1641a9c 100644 --- a/llvm/test/CodeGen/X86/optimize-max-0.ll +++ b/llvm/test/CodeGen/X86/optimize-max-0.ll @@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $28, %esp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: imull %ebp, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: imull %esi, %eax ; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %ecx, (%esp) ## 4-byte Spill +; CHECK-NEXT: movl %eax, (%esp) ## 4-byte Spill ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.1: ## %bb10.preheader -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: sarl $31, %eax -; CHECK-NEXT: shrl $30, %eax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: sarl $2, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl %eax, %ebp +; CHECK-NEXT: sarl $31, %ebp +; CHECK-NEXT: shrl $30, %ebp +; CHECK-NEXT: addl %eax, %ebp +; CHECK-NEXT: sarl $2, %ebp +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.2: ## %bb.nph9 -; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: testl %esi, %esi ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.3: ## %bb.nph9.split ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: incl %eax ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_4: ## %bb6 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movzbl (%eax,%esi,2), %ebx -; CHECK-NEXT: movb %bl, (%edx,%esi) -; CHECK-NEXT: incl %esi -; CHECK-NEXT: cmpl %ebp, %esi +; CHECK-NEXT: movzbl (%eax,%edi,2), %ebx +; CHECK-NEXT: movb %bl, (%edx,%edi) +; CHECK-NEXT: incl %edi +; CHECK-NEXT: cmpl %esi, %edi ; CHECK-NEXT: jl LBB0_4 ; CHECK-NEXT: ## %bb.5: ## %bb9 ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: incl %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebp, %edx -; CHECK-NEXT: cmpl %edi, %ecx +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: je LBB0_12 ; CHECK-NEXT: ## %bb.6: ## %bb7.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: jmp LBB0_4 ; CHECK-NEXT: LBB0_12: ## %bb18.loopexit +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl (%esp), %eax ## 4-byte Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %ebp, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: cmpl $1, %edi +; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) ; CHECK-NEXT: jle LBB0_13 ; CHECK-NEXT: ## %bb.7: ## %bb.nph5 -; CHECK-NEXT: cmpl $2, %ebp +; CHECK-NEXT: cmpl $2, %esi ; CHECK-NEXT: jl LBB0_13 ; CHECK-NEXT: ## %bb.8: ## %bb.nph5.split -; CHECK-NEXT: movl %ebp, %edx -; CHECK-NEXT: shrl $31, %edx -; CHECK-NEXT: addl %ebp, %edx -; CHECK-NEXT: sarl %edx +; CHECK-NEXT: movl %esi, %ebp +; CHECK-NEXT: shrl $31, %ebp +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: sarl %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $31, %ecx @@ -84,102 +84,103 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_9: ## %bb13 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB0_10 Depth 2 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %edx, %edi ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB0_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%ecx,%esi) -; CHECK-NEXT: movzbl (%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%eax,%esi) -; CHECK-NEXT: incl %esi -; CHECK-NEXT: cmpl %edx, %esi +; CHECK-NEXT: movzbl -2(%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%ecx,%ebx) +; CHECK-NEXT: movzbl (%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%eax,%ebx) +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: cmpl %ebp, %ebx ; CHECK-NEXT: jl LBB0_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB0_9 Depth=1 ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; CHECK-NEXT: incl %edi -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: addl %ebp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: addl %ebp, %ecx ; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; CHECK-NEXT: jl LBB0_9 ; CHECK-NEXT: LBB0_13: ## %bb20 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: cmpl $1, %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.14: ## %bb20 -; CHECK-NEXT: cmpl $3, %eax +; CHECK-NEXT: cmpl $3, %ecx ; CHECK-NEXT: jne LBB0_24 ; CHECK-NEXT: ## %bb.15: ## %bb22 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; CHECK-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; CHECK-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_18 ; CHECK-NEXT: ## %bb.16: ## %bb.nph -; CHECK-NEXT: leal 15(%edi), %eax +; CHECK-NEXT: leal 15(%edx), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebx, %ebx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: addl %ecx, %ebx -; CHECK-NEXT: addl %eax, %edx -; CHECK-NEXT: leal 15(%ebp), %eax +; CHECK-NEXT: addl %ebp, %ebp +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: addl %edi, %ecx +; CHECK-NEXT: addl %ecx, %ebp +; CHECK-NEXT: addl %eax, %ebx +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_17: ## %bb23 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi ; CHECK-NEXT: pushl %ebx -; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %ebp, %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %ebp, %ebx +; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %ebx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %edi +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_17 ; CHECK-NEXT: LBB0_18: ## %bb26 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl %ecx, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %esi, %edx ; CHECK-NEXT: jmp LBB0_23 ; CHECK-NEXT: LBB0_19: ## %bb29 -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_22 ; CHECK-NEXT: ## %bb.20: ## %bb.nph11 -; CHECK-NEXT: movl %edi, %esi -; CHECK-NEXT: leal 15(%ebp), %eax +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -187,30 +188,32 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: LBB0_21: ## %bb30 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx +; CHECK-NEXT: movl %ebp, %ebx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %edi -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %esi +; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_21 ; CHECK-NEXT: LBB0_22: ## %bb33 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx ; CHECK-NEXT: LBB0_23: ## %bb33 -; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: sarl %eax ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: pushl $128 -; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: pushl %edx ; CHECK-NEXT: calll _memset ; CHECK-NEXT: addl $44, %esp ; CHECK-NEXT: LBB0_25: ## %return @@ -523,38 +526,38 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB1_9: ## %bb13 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB1_10 Depth 2 -; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: andl $1, %ebx ; CHECK-NEXT: movl %edx, (%esp) ## 4-byte Spill -; CHECK-NEXT: addl %edx, %ebx -; CHECK-NEXT: imull {{[0-9]+}}(%esp), %ebx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB1_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB1_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%ebx,%esi,4), %edx -; CHECK-NEXT: movb %dl, (%eax,%esi) -; CHECK-NEXT: movzbl (%ebx,%esi,4), %edx -; CHECK-NEXT: movb %dl, (%ecx,%esi) +; CHECK-NEXT: movzbl -2(%edx,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%eax,%esi) +; CHECK-NEXT: movzbl (%edx,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%ecx,%esi) ; CHECK-NEXT: incl %esi ; CHECK-NEXT: cmpl %ebp, %esi ; CHECK-NEXT: jb LBB1_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB1_9 Depth=1 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; CHECK-NEXT: incl %ebx -; CHECK-NEXT: addl %ebp, %ecx ; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: incl %edx +; CHECK-NEXT: addl %ebp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl $2, %esi ; CHECK-NEXT: addl %ebp, %eax -; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; CHECK-NEXT: jb LBB1_9 ; CHECK-NEXT: LBB1_13: ## %bb20 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll index 420f5ba5ab433..31a7f1125150b 100644 --- a/llvm/test/CodeGen/X86/parity.ll +++ b/llvm/test/CodeGen/X86/parity.ll @@ -219,12 +219,12 @@ define i64 @parity_64(i64 %x) { ; ; X64-NOPOPCNT-LABEL: parity_64: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al @@ -264,12 +264,12 @@ define i32 @parity_64_trunc(i64 %x) { ; ; X64-NOPOPCNT-LABEL: parity_64_trunc: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al @@ -628,12 +628,12 @@ define i64 @parity_64_shift(i64 %0) { ; ; X64-NOPOPCNT-LABEL: parity_64_shift: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll index 21b25d87796a5..ffdb68c7a6c01 100644 --- a/llvm/test/CodeGen/X86/pr166744.ll +++ b/llvm/test/CodeGen/X86/pr166744.ll @@ -31,13 +31,13 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) { ; NOPOSTRA-LABEL: PR166744: ; NOPOSTRA: # %bb.0: ; NOPOSTRA-NEXT: movl %esi, %eax -; NOPOSTRA-NEXT: shrl $3, %eax -; NOPOSTRA-NEXT: andl $60, %eax -; NOPOSTRA-NEXT: movl (%rdi,%rax), %ecx -; NOPOSTRA-NEXT: btrl %esi, %ecx -; NOPOSTRA-NEXT: shlxl %esi, %edx, %edx -; NOPOSTRA-NEXT: orl %ecx, %edx -; NOPOSTRA-NEXT: movl %edx, (%rdi,%rax) +; NOPOSTRA-NEXT: shrl $3, %esi +; NOPOSTRA-NEXT: andl $60, %esi +; NOPOSTRA-NEXT: movl (%rdi,%rsi), %ecx +; NOPOSTRA-NEXT: btrl %eax, %ecx +; NOPOSTRA-NEXT: shlxl %eax, %edx, %eax +; NOPOSTRA-NEXT: orl %ecx, %eax +; NOPOSTRA-NEXT: movl %eax, (%rdi,%rsi) ; NOPOSTRA-NEXT: movq 16(%rdi), %rax ; NOPOSTRA-NEXT: movq (%rdi), %rcx ; NOPOSTRA-NEXT: movq 8(%rdi), %rdx diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll index 8f046a4f5aea5..26e68861cf45c 100644 --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -203,10 +203,10 @@ define i16 @no_extract_mul(i16 %i) nounwind { ; X64-LABEL: no_extract_mul: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $8, %edi ; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: leal (%rax,%rax,8), %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: shrl $9, %eax ; X64-NEXT: orl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll index ce56283df6010..8cb032776114b 100644 --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -10,10 +10,10 @@ declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32) define i32 @func(i32 %x, i32 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: movslq %esi, %rax -; X64-NEXT: movslq %edi, %rcx -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movslq %esi, %rcx +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrq $32, %rax ; X64-NEXT: shldl $30, %ecx, %eax ; X64-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll index e5ea911d4771a..a93be22bf5861 100644 --- a/llvm/test/CodeGen/X86/sshl_sat.ll +++ b/llvm/test/CodeGen/X86/sshl_sat.ll @@ -15,16 +15,16 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movswl %di, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %esi ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: testw %di, %di +; X64-NEXT: testw %dx, %dx ; X64-NEXT: sets %al ; X64-NEXT: addl $32767, %eax # imm = 0x7FFF -; X64-NEXT: cmpw %si, %di -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; @@ -33,17 +33,17 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movswl %dx, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %al ; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx -; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -58,18 +58,18 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax -; X64-NEXT: xorl %edx, %edx +; X64-NEXT: movl %eax, %edx +; X64-NEXT: xorl %esi, %esi ; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %dl -; X64-NEXT: addl $32767, %edx # imm = 0x7FFF -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movswl %si, %edi +; X64-NEXT: sets %sil +; X64-NEXT: addl $32767, %esi # imm = 0x7FFF +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movswl %ax, %edi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %edi -; X64-NEXT: cmpw %di, %ax -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movswl %si, %eax +; X64-NEXT: cmpw %di, %dx +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: cwtl ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index 10dee14bdd1a0..ff76707bdbb69 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -365,119 +365,118 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movswl %bx, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movswl %di, %ebp ; X86-NEXT: sarl %cl, %ebp ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %di, %di +; X86-NEXT: testw %bx, %bx ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %bp, %di -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmovel %ebx, %ecx +; X86-NEXT: cmpw %bp, %bx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: cmovel %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movswl %di, %ebx -; X86-NEXT: sarl %cl, %ebx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %si, %si -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %bx, %si ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi ; X86-NEXT: shll %cl, %esi ; X86-NEXT: movswl %si, %edi ; X86-NEXT: sarl %cl, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %bx, %bx +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmpw %di, %bx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx +; X86-NEXT: cmovel %esi, %ebp ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movswl %dx, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %di, %di ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %di +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edx +; X86-NEXT: sarl %cl, %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmpw %dx, %si +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: cmovel %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi -; X86-NEXT: sarl %cl, %esi +; X86-NEXT: movswl %dx, %eax +; X86-NEXT: sarl %cl, %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmpw %ax, %si ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovel %edx, %ecx -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %dx, %dx ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: cmovel %edx, %ebx +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: cmovel %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %dl ; X86-NEXT: addl $32767, %edx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %ecx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movw %cx, 14(%eax) ; X86-NEXT: movw %dx, 12(%eax) ; X86-NEXT: movw %bx, 10(%eax) -; X86-NEXT: movw %bp, 8(%eax) ; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 6(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movw %cx, 2(%eax) +; X86-NEXT: movw %bp, 2(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $16, %esp diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll index 72406aaa4efa8..9bf88cb8bdf81 100644 --- a/llvm/test/CodeGen/X86/stackmap.ll +++ b/llvm/test/CodeGen/X86/stackmap.ll @@ -1,7 +1,10 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -terminal-rule=0 | FileCheck %s ; ; Note: Print verbose stackmaps using -debug-only=stackmaps. +; FIXME: Test should be fixed to produce the correct sized spill with +; -terminal-rule=0 flag removed + ; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps ; CHECK-NEXT: __LLVM_StackMaps: ; Header @@ -546,8 +549,8 @@ define void @clobberScratch(i32 %a) { ret void } -; A stack frame which needs to be realigned at runtime (to meet alignment -; criteria for values on the stack) does not have a fixed frame size. +; A stack frame which needs to be realigned at runtime (to meet alignment +; criteria for values on the stack) does not have a fixed frame size. ; CHECK-LABEL: .long L{{.*}}-_needsStackRealignment ; CHECK-NEXT: .short 0 ; 0 locations diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index 5bd624c0697a0..01fbafb18eb9f 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -2429,126 +2429,126 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE2-ONLY: # %bb.0: ; SSE2-ONLY-NEXT: movl (%rdi), %eax ; SSE2-ONLY-NEXT: notl %eax -; SSE2-ONLY-NEXT: movw %ax, (%rsi) ; SSE2-ONLY-NEXT: movl %eax, %ecx -; SSE2-ONLY-NEXT: shrl $16, %ecx -; SSE2-ONLY-NEXT: movb %cl, 2(%rsi) -; SSE2-ONLY-NEXT: movb %cl, 2(%rdx) -; SSE2-ONLY-NEXT: movw %ax, (%rdx) -; SSE2-ONLY-NEXT: movb %cl, 6(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 10(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 8(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 14(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 18(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 16(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 22(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 26(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 24(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 30(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 34(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 32(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 38(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 42(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 40(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 46(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 50(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 48(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 54(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 58(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 56(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 62(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) +; SSE2-ONLY-NEXT: movw %ax, (%rsi) +; SSE2-ONLY-NEXT: shrl $16, %eax +; SSE2-ONLY-NEXT: movb %al, 2(%rsi) +; SSE2-ONLY-NEXT: movb %al, 2(%rdx) +; SSE2-ONLY-NEXT: movw %cx, (%rdx) +; SSE2-ONLY-NEXT: movb %al, 6(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 4(%rdx) +; SSE2-ONLY-NEXT: movb %al, 10(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 8(%rdx) +; SSE2-ONLY-NEXT: movb %al, 14(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 12(%rdx) +; SSE2-ONLY-NEXT: movb %al, 18(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 16(%rdx) +; SSE2-ONLY-NEXT: movb %al, 22(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 20(%rdx) +; SSE2-ONLY-NEXT: movb %al, 26(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 24(%rdx) +; SSE2-ONLY-NEXT: movb %al, 30(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 28(%rdx) +; SSE2-ONLY-NEXT: movb %al, 34(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 32(%rdx) +; SSE2-ONLY-NEXT: movb %al, 38(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 36(%rdx) +; SSE2-ONLY-NEXT: movb %al, 42(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 40(%rdx) +; SSE2-ONLY-NEXT: movb %al, 46(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 44(%rdx) +; SSE2-ONLY-NEXT: movb %al, 50(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 48(%rdx) +; SSE2-ONLY-NEXT: movb %al, 54(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 52(%rdx) +; SSE2-ONLY-NEXT: movb %al, 58(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 56(%rdx) +; SSE2-ONLY-NEXT: movb %al, 62(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 60(%rdx) ; SSE2-ONLY-NEXT: retq ; ; SSE3-LABEL: vec384_v3i8: ; SSE3: # %bb.0: ; SSE3-NEXT: movl (%rdi), %eax ; SSE3-NEXT: notl %eax -; SSE3-NEXT: movw %ax, (%rsi) ; SSE3-NEXT: movl %eax, %ecx -; SSE3-NEXT: shrl $16, %ecx -; SSE3-NEXT: movb %cl, 2(%rsi) -; SSE3-NEXT: movb %cl, 2(%rdx) -; SSE3-NEXT: movw %ax, (%rdx) -; SSE3-NEXT: movb %cl, 6(%rdx) -; SSE3-NEXT: movw %ax, 4(%rdx) -; SSE3-NEXT: movb %cl, 10(%rdx) -; SSE3-NEXT: movw %ax, 8(%rdx) -; SSE3-NEXT: movb %cl, 14(%rdx) -; SSE3-NEXT: movw %ax, 12(%rdx) -; SSE3-NEXT: movb %cl, 18(%rdx) -; SSE3-NEXT: movw %ax, 16(%rdx) -; SSE3-NEXT: movb %cl, 22(%rdx) -; SSE3-NEXT: movw %ax, 20(%rdx) -; SSE3-NEXT: movb %cl, 26(%rdx) -; SSE3-NEXT: movw %ax, 24(%rdx) -; SSE3-NEXT: movb %cl, 30(%rdx) -; SSE3-NEXT: movw %ax, 28(%rdx) -; SSE3-NEXT: movb %cl, 34(%rdx) -; SSE3-NEXT: movw %ax, 32(%rdx) -; SSE3-NEXT: movb %cl, 38(%rdx) -; SSE3-NEXT: movw %ax, 36(%rdx) -; SSE3-NEXT: movb %cl, 42(%rdx) -; SSE3-NEXT: movw %ax, 40(%rdx) -; SSE3-NEXT: movb %cl, 46(%rdx) -; SSE3-NEXT: movw %ax, 44(%rdx) -; SSE3-NEXT: movb %cl, 50(%rdx) -; SSE3-NEXT: movw %ax, 48(%rdx) -; SSE3-NEXT: movb %cl, 54(%rdx) -; SSE3-NEXT: movw %ax, 52(%rdx) -; SSE3-NEXT: movb %cl, 58(%rdx) -; SSE3-NEXT: movw %ax, 56(%rdx) -; SSE3-NEXT: movb %cl, 62(%rdx) -; SSE3-NEXT: movw %ax, 60(%rdx) +; SSE3-NEXT: movw %ax, (%rsi) +; SSE3-NEXT: shrl $16, %eax +; SSE3-NEXT: movb %al, 2(%rsi) +; SSE3-NEXT: movb %al, 2(%rdx) +; SSE3-NEXT: movw %cx, (%rdx) +; SSE3-NEXT: movb %al, 6(%rdx) +; SSE3-NEXT: movw %cx, 4(%rdx) +; SSE3-NEXT: movb %al, 10(%rdx) +; SSE3-NEXT: movw %cx, 8(%rdx) +; SSE3-NEXT: movb %al, 14(%rdx) +; SSE3-NEXT: movw %cx, 12(%rdx) +; SSE3-NEXT: movb %al, 18(%rdx) +; SSE3-NEXT: movw %cx, 16(%rdx) +; SSE3-NEXT: movb %al, 22(%rdx) +; SSE3-NEXT: movw %cx, 20(%rdx) +; SSE3-NEXT: movb %al, 26(%rdx) +; SSE3-NEXT: movw %cx, 24(%rdx) +; SSE3-NEXT: movb %al, 30(%rdx) +; SSE3-NEXT: movw %cx, 28(%rdx) +; SSE3-NEXT: movb %al, 34(%rdx) +; SSE3-NEXT: movw %cx, 32(%rdx) +; SSE3-NEXT: movb %al, 38(%rdx) +; SSE3-NEXT: movw %cx, 36(%rdx) +; SSE3-NEXT: movb %al, 42(%rdx) +; SSE3-NEXT: movw %cx, 40(%rdx) +; SSE3-NEXT: movb %al, 46(%rdx) +; SSE3-NEXT: movw %cx, 44(%rdx) +; SSE3-NEXT: movb %al, 50(%rdx) +; SSE3-NEXT: movw %cx, 48(%rdx) +; SSE3-NEXT: movb %al, 54(%rdx) +; SSE3-NEXT: movw %cx, 52(%rdx) +; SSE3-NEXT: movb %al, 58(%rdx) +; SSE3-NEXT: movw %cx, 56(%rdx) +; SSE3-NEXT: movb %al, 62(%rdx) +; SSE3-NEXT: movw %cx, 60(%rdx) ; SSE3-NEXT: retq ; ; SSSE3-ONLY-LABEL: vec384_v3i8: ; SSSE3-ONLY: # %bb.0: ; SSSE3-ONLY-NEXT: movl (%rdi), %eax ; SSSE3-ONLY-NEXT: notl %eax -; SSSE3-ONLY-NEXT: movw %ax, (%rsi) ; SSSE3-ONLY-NEXT: movl %eax, %ecx -; SSSE3-ONLY-NEXT: shrl $16, %ecx -; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi) -; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, (%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 34(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 58(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, (%rsi) +; SSSE3-ONLY-NEXT: shrl $16, %eax +; SSSE3-ONLY-NEXT: movb %al, 2(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 2(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, (%rdx) +; SSSE3-ONLY-NEXT: movb %al, 6(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 4(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 10(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 8(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 14(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 12(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 18(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 16(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 22(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 20(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 26(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 24(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 30(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 28(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 34(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 32(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 38(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 36(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 42(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 40(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 46(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 44(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 50(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 48(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 54(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 52(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 58(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 56(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 62(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 60(%rdx) ; SSSE3-ONLY-NEXT: retq ; ; SSE41-LABEL: vec384_v3i8: diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll index f20b777531c5a..3ad3e9a0e7655 100644 --- a/llvm/test/CodeGen/X86/twoaddr-lea.ll +++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll @@ -65,10 +65,10 @@ entry: define void @ham() { ; CHECK-LABEL: ham: ; CHECK: ## %bb.0: ## %bb +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movq _global@GOTPCREL(%rip), %rdx ; CHECK-NEXT: movq _global2@GOTPCREL(%rip), %rsi -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: je LBB3_2 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index eacc714b49a4d..5a68484596a2f 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -10,10 +10,10 @@ declare <4 x i32> @llvm.umul.fix.v4i32(<4 x i32>, <4 x i32>, i32) define i32 @func(i32 %x, i32 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrq $32, %rax ; X64-NEXT: shldl $30, %ecx, %eax ; X64-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll index e0e1ef7108d0d..9768e4761f47a 100644 --- a/llvm/test/CodeGen/X86/ushl_sat.ll +++ b/llvm/test/CodeGen/X86/ushl_sat.ll @@ -14,23 +14,23 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %eax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movzwl %di, %edx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %eax -; X64-NEXT: cmpw %ax, %di +; X64-NEXT: shrl %cl, %edx +; X64-NEXT: cmpw %dx, %ax ; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X86-LABEL: func: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movzwl %dx, %esi ; X86-NEXT: shrl %cl, %esi @@ -51,14 +51,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax ; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %esi +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movzwl %ax, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %esi -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax -; X64-NEXT: cwtl +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X64-NEXT: cmovel %eax, %ecx +; X64-NEXT: movswl %cx, %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index b8e83da9cf361..762088cfb2935 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -300,95 +300,94 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %si +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %dx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $65535, %esi # imm = 0xFFFF -; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %bp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movzwl %bp, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %si -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bp, %eax +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: cmpw %ax, %di +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %eax, %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bx, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $65535, %esi # imm = 0xFFFF ; X86-NEXT: cmovnel %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movzwl %bp, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: shll %cl, %edi -; X86-NEXT: movzwl %di, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax ; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl $65535, %ebx # imm = 0xFFFF ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movzwl %si, %eax +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzwl %dx, %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: cmpw %ax, %si +; X86-NEXT: cmovnel %ebx, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %bx ; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) -; X86-NEXT: movw %si, 12(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) ; X86-NEXT: movw %di, 10(%ecx) -; X86-NEXT: movw %bx, 8(%ecx) -; X86-NEXT: movw %bp, 6(%ecx) +; X86-NEXT: movw %bp, 8(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 6(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 2(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll index b233855029c58..324fe12de9400 100644 --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -85,14 +85,14 @@ define <4 x i16> @smulfixsat(<4 x i16> %a) { ; CHECK-NEXT: movswl %dx, %edx ; CHECK-NEXT: leal (,%rdx,4), %esi ; CHECK-NEXT: movl %esi, %edi -; CHECK-NEXT: shrl $16, %edi -; CHECK-NEXT: shldw $1, %si, %di +; CHECK-NEXT: shrl $16, %esi +; CHECK-NEXT: shldw $1, %di, %si ; CHECK-NEXT: sarl $14, %edx ; CHECK-NEXT: cmpl $16384, %edx # imm = 0x4000 -; CHECK-NEXT: cmovgel %eax, %edi +; CHECK-NEXT: cmovgel %eax, %esi ; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000 -; CHECK-NEXT: cmovll %ecx, %edi -; CHECK-NEXT: pinsrw $3, %edi, %xmm1 +; CHECK-NEXT: cmovll %ecx, %esi +; CHECK-NEXT: pinsrw $3, %esi, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %t = call <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16> , <4 x i16> %a, i32 15) @@ -106,19 +106,19 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) { ; CHECK-NEXT: pextrw $2, %xmm0, %eax ; CHECK-NEXT: leal (%rax,%rax,2), %eax ; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shrl $16, %edx -; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: shldw $1, %ax, %cx -; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000 +; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shldw $1, %dx, %cx +; CHECK-NEXT: cmpl $32768, %eax # imm = 0x8000 ; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF ; CHECK-NEXT: cmovael %eax, %ecx ; CHECK-NEXT: pextrw $1, %xmm0, %edx ; CHECK-NEXT: addl %edx, %edx ; CHECK-NEXT: movl %edx, %esi -; CHECK-NEXT: shrl $16, %esi -; CHECK-NEXT: movl %esi, %edi -; CHECK-NEXT: shldw $1, %dx, %di -; CHECK-NEXT: cmpl $32768, %esi # imm = 0x8000 +; CHECK-NEXT: shrl $16, %edx +; CHECK-NEXT: movl %edx, %edi +; CHECK-NEXT: shldw $1, %si, %di +; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000 ; CHECK-NEXT: cmovael %eax, %edi ; CHECK-NEXT: movd %xmm0, %edx ; CHECK-NEXT: xorl %esi, %esi @@ -133,10 +133,10 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) { ; CHECK-NEXT: pextrw $3, %xmm0, %ecx ; CHECK-NEXT: shll $2, %ecx ; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: shrl $16, %edx -; CHECK-NEXT: movl %edx, %esi -; CHECK-NEXT: shldw $1, %cx, %si -; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000 +; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: movl %ecx, %esi +; CHECK-NEXT: shldw $1, %dx, %si +; CHECK-NEXT: cmpl $32768, %ecx # imm = 0x8000 ; CHECK-NEXT: cmovael %eax, %esi ; CHECK-NEXT: pinsrw $3, %esi, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll index 320dce840ea57..6cb43234d713b 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -397,8 +397,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -409,8 +409,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512BW-NEXT: vpmovw2m %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -421,8 +421,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512VL-NEXT: vpmovw2m %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -722,8 +722,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -734,8 +734,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -746,8 +746,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -974,13 +974,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: shrq $32, %rcx -; AVX512BW-NEXT: xorl %eax, %ecx -; AVX512BW-NEXT: movl %ecx, %eax -; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrq $32, %rax ; AVX512BW-NEXT: xorl %ecx, %eax -; AVX512BW-NEXT: xorb %ah, %al +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: xorl %eax, %ecx +; AVX512BW-NEXT: xorb %ch, %cl ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -990,13 +990,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovb2m %zmm0, %k0 ; AVX512VL-NEXT: kmovq %k0, %rax -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: xorl %eax, %ecx -; AVX512VL-NEXT: movl %ecx, %eax -; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrq $32, %rax ; AVX512VL-NEXT: xorl %ecx, %eax -; AVX512VL-NEXT: xorb %ah, %al +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrl $16, %ecx +; AVX512VL-NEXT: xorl %eax, %ecx +; AVX512VL-NEXT: xorb %ch, %cl ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -1211,8 +1211,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind { ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1222,8 +1222,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind { ; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, zeroinitializer @@ -1427,8 +1427,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1439,8 +1439,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512BW-NEXT: vptestnmw %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1450,8 +1450,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512VL-NEXT: vptestnmw %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -1756,8 +1756,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1767,8 +1767,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1778,8 +1778,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2010,13 +2010,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: shrq $32, %rcx -; AVX512BW-NEXT: xorl %eax, %ecx -; AVX512BW-NEXT: movl %ecx, %eax -; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrq $32, %rax ; AVX512BW-NEXT: xorl %ecx, %eax -; AVX512BW-NEXT: xorb %ah, %al +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: xorl %eax, %ecx +; AVX512BW-NEXT: xorb %ch, %cl ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2025,13 +2025,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovq %k0, %rax -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: xorl %eax, %ecx -; AVX512VL-NEXT: movl %ecx, %eax -; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrq $32, %rax ; AVX512VL-NEXT: xorl %ecx, %eax -; AVX512VL-NEXT: xorb %ah, %al +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrl $16, %ecx +; AVX512VL-NEXT: xorl %eax, %ecx +; AVX512VL-NEXT: xorb %ch, %cl ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2240,8 +2240,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind { ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2251,8 +2251,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind { ; AVX512VL-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, %1 @@ -2504,8 +2504,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2517,8 +2517,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind { ; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2528,8 +2528,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind { ; AVX512VL-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2845,8 +2845,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind { ; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2856,8 +2856,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind { ; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2867,8 +2867,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind { ; AVX512VL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -3097,13 +3097,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: shrq $32, %rcx -; AVX512BW-NEXT: xorl %eax, %ecx -; AVX512BW-NEXT: movl %ecx, %eax -; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrq $32, %rax ; AVX512BW-NEXT: xorl %ecx, %eax -; AVX512BW-NEXT: xorb %ah, %al +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: xorl %eax, %ecx +; AVX512BW-NEXT: xorb %ch, %cl ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3112,13 +3112,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512VL-NEXT: kmovq %k0, %rax -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: xorl %eax, %ecx -; AVX512VL-NEXT: movl %ecx, %eax -; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrq $32, %rax ; AVX512VL-NEXT: xorl %ecx, %eax -; AVX512VL-NEXT: xorb %ah, %al +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrl $16, %ecx +; AVX512VL-NEXT: xorl %eax, %ecx +; AVX512VL-NEXT: xorb %ch, %cl ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index 3c98eba69ae5b..1c3d27fac4203 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -777,31 +777,31 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, (%esp) +; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: andb $12, %bl -; FALLBACK18-NEXT: movzbl %bl, %esi -; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi -; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx -; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: orl %ebp, %ecx -; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %ebp, %edi -; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx -; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi -; FALLBACK18-NEXT: shrxl %eax, %esi, %eax -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %edx -; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: movzbl %bl, %edi +; FALLBACK18-NEXT: movl 4(%esp,%edi), %ebx +; FALLBACK18-NEXT: movl 8(%esp,%edi), %esi +; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebp +; FALLBACK18-NEXT: notb %al +; FALLBACK18-NEXT: leal (%esi,%esi), %edx +; FALLBACK18-NEXT: shlxl %eax, %edx, %edx +; FALLBACK18-NEXT: orl %ebp, %edx +; FALLBACK18-NEXT: shrxl %ecx, (%esp,%edi), %ebp +; FALLBACK18-NEXT: addl %ebx, %ebx +; FALLBACK18-NEXT: shlxl %eax, %ebx, %ebx +; FALLBACK18-NEXT: orl %ebp, %ebx +; FALLBACK18-NEXT: movl 12(%esp,%edi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ebp +; FALLBACK18-NEXT: shlxl %eax, %ebp, %eax +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK18-NEXT: orl %esi, %eax +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK18-NEXT: movl %eax, 12(%esi) -; FALLBACK18-NEXT: movl %edx, 8(%esi) -; FALLBACK18-NEXT: movl %edi, (%esi) -; FALLBACK18-NEXT: movl %ecx, 4(%esi) +; FALLBACK18-NEXT: movl %ecx, 12(%esi) +; FALLBACK18-NEXT: movl %eax, 8(%esi) +; FALLBACK18-NEXT: movl %ebx, (%esi) +; FALLBACK18-NEXT: movl %edx, 4(%esi) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -962,42 +962,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: pushl %ebx ; FALLBACK22-NEXT: pushl %edi ; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $44, %esp +; FALLBACK22-NEXT: subl $60, %esp ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: xorps %xmm1, %xmm1 ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, (%esp) -; FALLBACK22-NEXT: andb $12, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: notb %cl -; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK22-NEXT: orl %ebx, %edx -; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK22-NEXT: shrxl %eax, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %ebx, %edi -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: andb $12, %dl +; FALLBACK22-NEXT: movzbl %dl, %edi +; FALLBACK22-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK22-NEXT: notb %al +; FALLBACK22-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %eax, %edx, %edx +; FALLBACK22-NEXT: orl %ebp, %edx +; FALLBACK22-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK22-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK22-NEXT: shlxl %eax, %edi, %edi +; FALLBACK22-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: addl %ebx, %ebx +; FALLBACK22-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %eax, 12(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) +; FALLBACK22-NEXT: movl %ecx, 12(%esi) +; FALLBACK22-NEXT: movl %eax, 4(%esi) ; FALLBACK22-NEXT: movl %edi, 8(%esi) ; FALLBACK22-NEXT: movl %edx, (%esi) -; FALLBACK22-NEXT: addl $44, %esp +; FALLBACK22-NEXT: addl $60, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx @@ -1152,42 +1153,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $44, %esp +; FALLBACK26-NEXT: subl $60, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK26-NEXT: andb $12, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: notb %cl -; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK26-NEXT: orl %ebx, %edx -; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK26-NEXT: shrxl %eax, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: orl %ebx, %edi -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: andb $12, %dl +; FALLBACK26-NEXT: movzbl %dl, %edi +; FALLBACK26-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %eax, %edx, %edx +; FALLBACK26-NEXT: orl %ebp, %edx +; FALLBACK26-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK26-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK26-NEXT: shlxl %eax, %edi, %edi +; FALLBACK26-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %eax, 12(%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) +; FALLBACK26-NEXT: movl %ecx, 12(%esi) +; FALLBACK26-NEXT: movl %eax, 4(%esi) ; FALLBACK26-NEXT: movl %edi, 8(%esi) ; FALLBACK26-NEXT: movl %edx, (%esi) -; FALLBACK26-NEXT: addl $44, %esp +; FALLBACK26-NEXT: addl $60, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx @@ -1342,42 +1344,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $44, %esp +; FALLBACK30-NEXT: subl $60, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK30-NEXT: andb $12, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: notb %cl -; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK30-NEXT: orl %ebx, %edx -; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK30-NEXT: shrxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: orl %ebx, %edi -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: andb $12, %dl +; FALLBACK30-NEXT: movzbl %dl, %edi +; FALLBACK30-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %eax, %edx, %edx +; FALLBACK30-NEXT: orl %ebp, %edx +; FALLBACK30-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK30-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK30-NEXT: shlxl %eax, %edi, %edi +; FALLBACK30-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %eax, 12(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) +; FALLBACK30-NEXT: movl %ecx, 12(%esi) +; FALLBACK30-NEXT: movl %eax, 4(%esi) ; FALLBACK30-NEXT: movl %edi, 8(%esi) ; FALLBACK30-NEXT: movl %edx, (%esi) -; FALLBACK30-NEXT: addl $44, %esp +; FALLBACK30-NEXT: addl $60, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx @@ -1784,41 +1787,41 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl 4(%ecx), %esi ; FALLBACK18-NEXT: movl 8(%ecx), %edi ; FALLBACK18-NEXT: movl 12(%ecx), %ecx -; FALLBACK18-NEXT: movzbl (%eax), %eax -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: shlb $3, %bl +; FALLBACK18-NEXT: movzbl (%eax), %ebx +; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: shlb $3, %al ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, (%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $12, %al -; FALLBACK18-NEXT: negb %al -; FALLBACK18-NEXT: movsbl %al, %edx -; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi -; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp -; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: movl %eax, %ecx +; FALLBACK18-NEXT: andb $12, %bl +; FALLBACK18-NEXT: negb %bl +; FALLBACK18-NEXT: movsbl %bl, %esi +; FALLBACK18-NEXT: movl 16(%esp,%esi), %ebx +; FALLBACK18-NEXT: movl 20(%esp,%esi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edi ; FALLBACK18-NEXT: notb %al -; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %eax, %edi, %edi -; FALLBACK18-NEXT: orl %esi, %edi -; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx -; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebp +; FALLBACK18-NEXT: shrl %ebx +; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; FALLBACK18-NEXT: movl 24(%esp,%esi), %esi +; FALLBACK18-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %eax, %esi, %esi +; FALLBACK18-NEXT: orl %edi, %esi ; FALLBACK18-NEXT: shrl %edx -; FALLBACK18-NEXT: shrxl %eax, %edx, %edx -; FALLBACK18-NEXT: orl %esi, %edx -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax -; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: shrxl %eax, %edx, %eax +; FALLBACK18-NEXT: orl %ecx, %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK18-NEXT: movl %ebp, (%ecx) ; FALLBACK18-NEXT: movl %eax, 8(%ecx) -; FALLBACK18-NEXT: movl %edx, 12(%ecx) -; FALLBACK18-NEXT: movl %edi, 4(%ecx) +; FALLBACK18-NEXT: movl %esi, 12(%ecx) +; FALLBACK18-NEXT: movl %ebx, 4(%ecx) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -1983,39 +1986,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: xorps %xmm1, %xmm1 ; FALLBACK22-NEXT: movaps %xmm1, (%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $12, %cl -; FALLBACK22-NEXT: negb %cl -; FALLBACK22-NEXT: movsbl %cl, %ecx -; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %edi -; FALLBACK22-NEXT: movl %eax, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK22-NEXT: orl %esi, %edx -; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl %esi, %ebp +; FALLBACK22-NEXT: movl %eax, %ecx +; FALLBACK22-NEXT: andb $12, %dl +; FALLBACK22-NEXT: negb %dl +; FALLBACK22-NEXT: movsbl %dl, %edx +; FALLBACK22-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK22-NEXT: notb %al +; FALLBACK22-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %eax, %esi, %esi +; FALLBACK22-NEXT: orl %edi, %esi +; FALLBACK22-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK22-NEXT: movl %edi, %ebp ; FALLBACK22-NEXT: shrl %ebp -; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %eax, (%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %ebp, 8(%esi) -; FALLBACK22-NEXT: movl %edx, 12(%esi) +; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK22-NEXT: orl %ebx, %ebp +; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK22-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK22-NEXT: shrl %edx +; FALLBACK22-NEXT: shrxl %eax, %edx, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, (%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %ebp, 8(%edx) +; FALLBACK22-NEXT: movl %esi, 12(%edx) ; FALLBACK22-NEXT: addl $44, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -2175,39 +2178,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $12, %cl -; FALLBACK26-NEXT: negb %cl -; FALLBACK26-NEXT: movsbl %cl, %ecx -; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %edi -; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK26-NEXT: orl %esi, %edx -; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl %esi, %ebp +; FALLBACK26-NEXT: movl %eax, %ecx +; FALLBACK26-NEXT: andb $12, %dl +; FALLBACK26-NEXT: negb %dl +; FALLBACK26-NEXT: movsbl %dl, %edx +; FALLBACK26-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %eax, %esi, %esi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK26-NEXT: movl %edi, %ebp ; FALLBACK26-NEXT: shrl %ebp -; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK26-NEXT: orl %edi, %ebp -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %eax, (%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) -; FALLBACK26-NEXT: movl %ebp, 8(%esi) -; FALLBACK26-NEXT: movl %edx, 12(%esi) +; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK26-NEXT: orl %ebx, %ebp +; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK26-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK26-NEXT: shrl %edx +; FALLBACK26-NEXT: shrxl %eax, %edx, %eax +; FALLBACK26-NEXT: orl %edi, %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %ecx, (%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %ebp, 8(%edx) +; FALLBACK26-NEXT: movl %esi, 12(%edx) ; FALLBACK26-NEXT: addl $44, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -2367,39 +2370,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $12, %cl -; FALLBACK30-NEXT: negb %cl -; FALLBACK30-NEXT: movsbl %cl, %ecx -; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %edi -; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK30-NEXT: orl %esi, %edx -; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl %esi, %ebp +; FALLBACK30-NEXT: movl %eax, %ecx +; FALLBACK30-NEXT: andb $12, %dl +; FALLBACK30-NEXT: negb %dl +; FALLBACK30-NEXT: movsbl %dl, %edx +; FALLBACK30-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %eax, %esi, %esi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK30-NEXT: movl %edi, %ebp ; FALLBACK30-NEXT: shrl %ebp -; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK30-NEXT: orl %edi, %ebp -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %eax, (%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %ebp, 8(%esi) -; FALLBACK30-NEXT: movl %edx, 12(%esi) +; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK30-NEXT: orl %ebx, %ebp +; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK30-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK30-NEXT: shrl %edx +; FALLBACK30-NEXT: shrxl %eax, %edx, %eax +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, (%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %ebp, 8(%edx) +; FALLBACK30-NEXT: movl %esi, 12(%edx) ; FALLBACK30-NEXT: addl $44, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -2833,31 +2836,31 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %bl -; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebp -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl -; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebx,%ebx), %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebx, %ebx -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax -; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%edi), %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%edi), %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %ebx, %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%esi,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, (%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%edi), %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %edi, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, (%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%esi) ; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi @@ -3208,30 +3211,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes: @@ -3355,30 +3358,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %esi ; FALLBACK6-NEXT: andb $24, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes: @@ -3487,35 +3490,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-LABEL: lshr_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: movzbl (%rsi), %eax +; FALLBACK10-NEXT: leal (,%rax,8), %ecx ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: movzbl %cl, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movl %ecx, %esi +; FALLBACK10-NEXT: andb $24, %al +; FALLBACK10-NEXT: movzbl %al, %eax +; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rax), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %rdi, %rcx +; FALLBACK10-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 24(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -3623,35 +3626,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-LABEL: lshr_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: movzbl (%rsi), %eax +; FALLBACK14-NEXT: leal (,%rax,8), %ecx ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: movzbl %cl, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movl %ecx, %esi +; FALLBACK14-NEXT: andb $24, %al +; FALLBACK14-NEXT: movzbl %al, %eax +; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rax), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %rdi, %rcx +; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 24(%rdx) +; FALLBACK14-NEXT: movq %rcx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -3914,81 +3917,75 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movl %ebx, %ecx +; FALLBACK18-NEXT: shlb $3, %cl ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, %eax ; FALLBACK18-NEXT: andb $28, %bl -; FALLBACK18-NEXT: movzbl %bl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %esi, %edx +; FALLBACK18-NEXT: movzbl %bl, %esi +; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx +; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%esi), %edi +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: orl %edi, %edx ; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %ebx, %ecx -; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx +; FALLBACK18-NEXT: movl 44(%esp,%esi), %edx +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %esi, %esi +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK18-NEXT: movl %esi, 28(%edi) +; FALLBACK18-NEXT: movl %ecx, 24(%edi) +; FALLBACK18-NEXT: movl %eax, 16(%edi) +; FALLBACK18-NEXT: movl %edx, 20(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edi) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -4261,72 +4258,70 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlb $3, %dl +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: shlb $3, %cl ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %edx, %eax -; FALLBACK22-NEXT: notb %al -; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK22-NEXT: movl %eax, %ebp -; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: movzbl %dl, %ebx +; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK22-NEXT: movl %eax, %edi +; FALLBACK22-NEXT: notb %cl +; FALLBACK22-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK22-NEXT: movl %ebp, %ecx -; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK22-NEXT: orl %edx, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %ebx, %ebx +; FALLBACK22-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK22-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK22-NEXT: shrxl %edi, %edx, %esi +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK22-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edi, %edx, %esi +; FALLBACK22-NEXT: movl %edi, %edx +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebp +; FALLBACK22-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK22-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK22-NEXT: shrxl %edi, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebx ; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK22-NEXT: orl %ebp, %ebx -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %ebp, %edi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl %ecx, 4(%edx) -; FALLBACK22-NEXT: movl %edi, 24(%edx) -; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: orl %eax, %ebx ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, 28(%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %ebx, 24(%edx) +; FALLBACK22-NEXT: movl %edi, 16(%edx) +; FALLBACK22-NEXT: movl %ebp, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4585,70 +4580,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %edx -; FALLBACK26-NEXT: shlb $3, %dl +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %ecx +; FALLBACK26-NEXT: shlb $3, %cl ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %edx, %eax -; FALLBACK26-NEXT: notb %al -; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl %eax, %ebp -; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK26-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: movzbl %dl, %ebx +; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK26-NEXT: movl %eax, %edi +; FALLBACK26-NEXT: notb %cl +; FALLBACK26-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebx -; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK26-NEXT: movl %ebp, %ecx -; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: leal (%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK26-NEXT: orl %edx, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK26-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK26-NEXT: shrxl %edi, %edx, %esi +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK26-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edi, %edx, %esi +; FALLBACK26-NEXT: movl %edi, %edx +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebp +; FALLBACK26-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK26-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK26-NEXT: shrxl %edi, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebx ; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK26-NEXT: orl %ebp, %ebx -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: movl %ecx, %edx -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: orl %ebp, %edi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %eax, 28(%edx) -; FALLBACK26-NEXT: movl %ecx, 4(%edx) -; FALLBACK26-NEXT: movl %edi, 24(%edx) -; FALLBACK26-NEXT: movl %ebx, 16(%edx) +; FALLBACK26-NEXT: orl %eax, %ebx ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %ecx, 28(%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %ebx, 24(%edx) +; FALLBACK26-NEXT: movl %edi, 16(%edx) +; FALLBACK26-NEXT: movl %ebp, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4906,70 +4899,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %edx -; FALLBACK30-NEXT: shlb $3, %dl +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %ecx +; FALLBACK30-NEXT: shlb $3, %cl ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %edx, %eax -; FALLBACK30-NEXT: notb %al -; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK30-NEXT: movl %eax, %ebp -; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: movzbl %dl, %ebx +; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK30-NEXT: movl %eax, %edi +; FALLBACK30-NEXT: notb %cl +; FALLBACK30-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK30-NEXT: movl %ebp, %ecx -; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK30-NEXT: orl %edx, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK30-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK30-NEXT: shrxl %edi, %edx, %esi +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK30-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edi, %edx, %esi +; FALLBACK30-NEXT: movl %edi, %edx +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebp +; FALLBACK30-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK30-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK30-NEXT: shrxl %edi, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebx ; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK30-NEXT: orl %ebp, %ebx -; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: movl %ecx, %edx -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: orl %ebp, %edi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl %ecx, 4(%edx) -; FALLBACK30-NEXT: movl %edi, 24(%edx) -; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: orl %eax, %ebx ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, 28(%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %ebx, 24(%edx) +; FALLBACK30-NEXT: movl %edi, 16(%edx) +; FALLBACK30-NEXT: movl %ebp, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5157,30 +5148,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes_dwordOff: @@ -5307,30 +5298,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %esi ; FALLBACK6-NEXT: andb $6, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx,4), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes_dwordOff: @@ -5441,36 +5432,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK10-LABEL: lshr_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax -; FALLBACK10-NEXT: shlb $5, %al +; FALLBACK10-NEXT: movzbl (%rsi), %eax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: shlb $5, %cl ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $6, %cl -; FALLBACK10-NEXT: movzbl %cl, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movl %ecx, %esi +; FALLBACK10-NEXT: andb $6, %al +; FALLBACK10-NEXT: movzbl %al, %eax +; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rax,4), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %rdi, %rcx +; FALLBACK10-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 24(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -5580,36 +5571,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK14-LABEL: lshr_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: movl %ecx, %eax -; FALLBACK14-NEXT: shlb $5, %al +; FALLBACK14-NEXT: movzbl (%rsi), %eax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: shlb $5, %cl ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $6, %cl -; FALLBACK14-NEXT: movzbl %cl, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movl %ecx, %esi +; FALLBACK14-NEXT: andb $6, %al +; FALLBACK14-NEXT: movzbl %al, %eax +; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rax,4), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %rdi, %rcx +; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 24(%rdx) +; FALLBACK14-NEXT: movq %rcx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -6025,31 +6016,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movsbq %sil, %rdi +; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 +; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes: @@ -6167,38 +6158,38 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: leal (,%rcx,8), %eax +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: leal (,%rsi,8), %eax ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: negb %cl -; FALLBACK6-NEXT: movsbq %cl, %rcx -; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: andb $24, %sil +; FALLBACK6-NEXT: negb %sil +; FALLBACK6-NEXT: movsbq %sil, %rsi +; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK6-NEXT: shrq %rsi +; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK6-NEXT: orq %r9, %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: shrq %rcx -; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r8, %rcx -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; FALLBACK6-NEXT: movq %rsi, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes: @@ -6308,36 +6299,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-LABEL: shl_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: negb %cl -; FALLBACK10-NEXT: movsbq %cl, %rcx -; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andb $24, %sil +; FALLBACK10-NEXT: negb %sil +; FALLBACK10-NEXT: movsbq %sil, %rsi +; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK10-NEXT: shrq %rsi +; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK10-NEXT: orq %r9, %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: shrq %rcx -; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r8, %rcx -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, 24(%rdx) +; FALLBACK10-NEXT: movq %rsi, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -6446,36 +6437,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-LABEL: shl_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: leal (,%rsi,8), %eax ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: negb %cl -; FALLBACK14-NEXT: movsbq %cl, %rcx -; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: andb $24, %sil +; FALLBACK14-NEXT: negb %sil +; FALLBACK14-NEXT: movsbq %sil, %rsi +; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK14-NEXT: shrq %rsi +; FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK14-NEXT: orq %r9, %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: shrq %rcx -; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r8, %rcx -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: movq %rsi, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -6745,71 +6736,75 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, %eax +; FALLBACK18-NEXT: movl %eax, %ebp ; FALLBACK18-NEXT: andb $28, %bl ; FALLBACK18-NEXT: negb %bl ; FALLBACK18-NEXT: movsbl %bl, %esi ; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %eax, %edi -; FALLBACK18-NEXT: movl %edx, %ecx -; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: movl 68(%esp,%esi), %ecx +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %ecx, %edi +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: shrxl %edx, %ebx, %ebx ; FALLBACK18-NEXT: orl %edi, %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, %edi ; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax +; FALLBACK18-NEXT: shrxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp +; FALLBACK18-NEXT: movl %ebp, %esi +; FALLBACK18-NEXT: shlxl %ebp, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebx +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %edx, %ecx, %ecx +; FALLBACK18-NEXT: orl %ebx, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK18-NEXT: movl 80(%esp,%ebp), %ecx +; FALLBACK18-NEXT: movl %ecx, %ebx ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax -; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebp +; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %esi, %ecx, %ecx +; FALLBACK18-NEXT: movl %esi, %eax ; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp -; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: shrxl %edx, %edi, %edi +; FALLBACK18-NEXT: orl %ecx, %edi +; FALLBACK18-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: shlxl %esi, 92(%esp,%ecx), %ebp +; FALLBACK18-NEXT: movl 88(%esp,%ecx), %esi +; FALLBACK18-NEXT: shlxl %eax, %esi, %ecx ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK18-NEXT: shrxl %edx, %esi, %esi ; FALLBACK18-NEXT: orl %ebp, %esi ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) +; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: movl %ecx, (%edx) +; FALLBACK18-NEXT: movl %eax, 24(%edx) +; FALLBACK18-NEXT: movl %esi, 28(%edx) +; FALLBACK18-NEXT: movl %edi, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edx) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -7085,78 +7080,76 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: shlb $3, %cl ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: negb %cl -; FALLBACK22-NEXT: movsbl %cl, %edx -; FALLBACK22-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi -; FALLBACK22-NEXT: movl %eax, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, %esi -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %edi +; FALLBACK22-NEXT: movl %ecx, %ebx +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: negb %dl +; FALLBACK22-NEXT: movsbl %dl, %edx +; FALLBACK22-NEXT: movl 84(%esp,%edx), %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK22-NEXT: notb %cl +; FALLBACK22-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK22-NEXT: orl %ecx, %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %esi, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK22-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK22-NEXT: movl %esi, %edi ; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK22-NEXT: orl %ecx, %ebp -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %ebp, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK22-NEXT: movl %edi, %ebp +; FALLBACK22-NEXT: shrl %ebp +; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ebp +; FALLBACK22-NEXT: orl %esi, %ebp +; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK22-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK22-NEXT: movl %ebp, %edi +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %ecx -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK22-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: orl %ebp, %edi ; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK22-NEXT: orl %edi, %edx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK22-NEXT: movl %edi, (%esi) -; FALLBACK22-NEXT: movl %edx, 28(%esi) -; FALLBACK22-NEXT: movl %eax, 24(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %ebp, 8(%esi) +; FALLBACK22-NEXT: shrxl %ecx, %eax, %esi +; FALLBACK22-NEXT: movl 88(%esp,%edx), %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ecx, %eax, %eax +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movl %ebp, (%ecx) +; FALLBACK22-NEXT: movl %eax, 28(%ecx) +; FALLBACK22-NEXT: movl %esi, 24(%ecx) +; FALLBACK22-NEXT: movl %edi, 4(%ecx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 8(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 12(%esi) +; FALLBACK22-NEXT: movl %eax, 12(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 16(%esi) +; FALLBACK22-NEXT: movl %eax, 16(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%esi) +; FALLBACK22-NEXT: movl %eax, 20(%ecx) ; FALLBACK22-NEXT: addl $108, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -7410,76 +7403,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: negb %cl -; FALLBACK26-NEXT: movsbl %cl, %edx -; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi ; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, %esi -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %edi +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: negb %dl +; FALLBACK26-NEXT: movsbl %dl, %edx +; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %esi +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK26-NEXT: orl %ecx, %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %esi, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK26-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK26-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK26-NEXT: movl %esi, %edi ; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK26-NEXT: orl %ecx, %ebp -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %ebp, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK26-NEXT: movl %edi, %ebp +; FALLBACK26-NEXT: shrl %ebp +; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK26-NEXT: orl %esi, %ebp +; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK26-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK26-NEXT: movl %ebp, %edi +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %ecx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK26-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %esi +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %eax, %esi, %edi +; FALLBACK26-NEXT: orl %ebp, %edi +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %eax, %ecx, %esi +; FALLBACK26-NEXT: movl 88(%esp,%edx), %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebp +; FALLBACK26-NEXT: orl %ebp, %esi +; FALLBACK26-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %eax, %ecx, %eax +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: movl %ebp, (%ecx) +; FALLBACK26-NEXT: movl %eax, 28(%ecx) +; FALLBACK26-NEXT: movl %esi, 24(%ecx) +; FALLBACK26-NEXT: movl %edi, 4(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK26-NEXT: orl %edi, %edx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK26-NEXT: movl %edi, (%esi) -; FALLBACK26-NEXT: movl %edx, 28(%esi) -; FALLBACK26-NEXT: movl %eax, 24(%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) -; FALLBACK26-NEXT: movl %ebp, 8(%esi) +; FALLBACK26-NEXT: movl %eax, 8(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 12(%esi) +; FALLBACK26-NEXT: movl %eax, 12(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 16(%esi) +; FALLBACK26-NEXT: movl %eax, 16(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%esi) +; FALLBACK26-NEXT: movl %eax, 20(%ecx) ; FALLBACK26-NEXT: addl $108, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -7732,76 +7723,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: negb %cl -; FALLBACK30-NEXT: movsbl %cl, %edx -; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi ; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, %esi -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %edi +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: negb %dl +; FALLBACK30-NEXT: movsbl %dl, %edx +; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %esi +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %esi, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK30-NEXT: movl %esi, %edi ; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ecx, %ebp -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi -; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %ecx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %edi, %ecx -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK30-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK30-NEXT: orl %edi, %edx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK30-NEXT: movl %edi, (%esi) -; FALLBACK30-NEXT: movl %edx, 28(%esi) -; FALLBACK30-NEXT: movl %eax, 24(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %ebp, 8(%esi) +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %ebp, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK30-NEXT: movl %edi, %ebp +; FALLBACK30-NEXT: shrl %ebp +; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK30-NEXT: orl %esi, %ebp +; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK30-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK30-NEXT: movl %ebp, %edi +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %eax, %esi, %edi +; FALLBACK30-NEXT: orl %ebp, %edi +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %eax, %ecx, %esi +; FALLBACK30-NEXT: movl 88(%esp,%edx), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %eax, %ecx, %eax +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: movl %ebp, (%ecx) +; FALLBACK30-NEXT: movl %eax, 28(%ecx) +; FALLBACK30-NEXT: movl %esi, 24(%ecx) +; FALLBACK30-NEXT: movl %edi, 4(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%esi) +; FALLBACK30-NEXT: movl %eax, 12(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 16(%esi) +; FALLBACK30-NEXT: movl %eax, 16(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%esi) +; FALLBACK30-NEXT: movl %eax, 20(%ecx) ; FALLBACK30-NEXT: addl $108, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -7987,32 +7976,32 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: shlb $2, %sil ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movsbq %sil, %rdi +; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 +; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes_dwordOff: @@ -8135,40 +8124,40 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: movl %ecx, %eax +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: movl %esi, %eax ; FALLBACK6-NEXT: shlb $5, %al ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: shlb $2, %cl -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: negb %cl -; FALLBACK6-NEXT: movsbq %cl, %rcx -; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: shlb $2, %sil +; FALLBACK6-NEXT: andb $24, %sil +; FALLBACK6-NEXT: negb %sil +; FALLBACK6-NEXT: movsbq %sil, %rsi +; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK6-NEXT: shrq %rsi +; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK6-NEXT: orq %r9, %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: shrq %rcx -; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r8, %rcx -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; FALLBACK6-NEXT: movq %rsi, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes_dwordOff: @@ -8283,38 +8272,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK10-LABEL: shl_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: movl %esi, %eax ; FALLBACK10-NEXT: shlb $5, %al ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: shlb $2, %cl -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: negb %cl -; FALLBACK10-NEXT: movsbq %cl, %rcx -; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: shlb $2, %sil +; FALLBACK10-NEXT: andb $24, %sil +; FALLBACK10-NEXT: negb %sil +; FALLBACK10-NEXT: movsbq %sil, %rsi +; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK10-NEXT: shrq %rsi +; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK10-NEXT: orq %r9, %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: shrq %rcx -; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r8, %rcx -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, 24(%rdx) +; FALLBACK10-NEXT: movq %rsi, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -8428,38 +8417,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK14-LABEL: shl_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: movl %ecx, %eax +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: movl %esi, %eax ; FALLBACK14-NEXT: shlb $5, %al ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: shlb $2, %cl -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: negb %cl -; FALLBACK14-NEXT: movsbq %cl, %rcx -; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: shlb $2, %sil +; FALLBACK14-NEXT: andb $24, %sil +; FALLBACK14-NEXT: negb %sil +; FALLBACK14-NEXT: movsbq %sil, %rsi +; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK14-NEXT: shrq %rsi +; FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK14-NEXT: orq %r9, %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: shrq %rcx -; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r8, %rcx -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: movq %rsi, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -8906,30 +8895,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes: @@ -9067,30 +9056,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %ecx ; FALLBACK6-NEXT: andb $24, %sil -; FALLBACK6-NEXT: movzbl %sil, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movzbl %sil, %esi +; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes: @@ -9227,30 +9216,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movl %eax, %ecx ; FALLBACK10-NEXT: andb $24, %sil -; FALLBACK10-NEXT: movzbl %sil, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movzbl %sil, %esi +; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %rdi, %rax +; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes: @@ -9387,30 +9376,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andb $24, %sil -; FALLBACK14-NEXT: movzbl %sil, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movzbl %sil, %esi +; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %rdi, %rax +; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: movq %rcx, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes: @@ -9671,7 +9660,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $108, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK18-NEXT: movl (%esi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -9680,22 +9669,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl 8(%esi), %ebx ; FALLBACK18-NEXT: movl 12(%esi), %ebp ; FALLBACK18-NEXT: movl 16(%esi), %edi -; FALLBACK18-NEXT: movzbl (%ecx), %ecx -; FALLBACK18-NEXT: movl 20(%esi), %edx +; FALLBACK18-NEXT: movzbl (%edx), %edx +; FALLBACK18-NEXT: movl 20(%esi), %ecx ; FALLBACK18-NEXT: movl 24(%esi), %eax ; FALLBACK18-NEXT: movl 28(%esi), %esi ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movl %edx, %ecx +; FALLBACK18-NEXT: shlb $3, %cl ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: sarl $31, %esi ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) @@ -9705,66 +9694,65 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $28, %cl -; FALLBACK18-NEXT: movzbl %cl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp -; FALLBACK18-NEXT: orl %ebx, %ebp -; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %esi -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%esi,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl %ecx, %eax +; FALLBACK18-NEXT: andb $28, %dl +; FALLBACK18-NEXT: movzbl %dl, %esi +; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx +; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%esi), %edi +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx +; FALLBACK18-NEXT: movl 44(%esp,%esi), %edx +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: sarxl %ebp, %esi, %esi +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK18-NEXT: movl %esi, 28(%edi) +; FALLBACK18-NEXT: movl %ecx, 24(%edi) +; FALLBACK18-NEXT: movl %eax, 16(%edi) +; FALLBACK18-NEXT: movl %edx, 20(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edi) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -10070,82 +10058,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movl 16(%ecx), %esi ; FALLBACK22-NEXT: movl 20(%ecx), %edi -; FALLBACK22-NEXT: movl 24(%ecx), %ebx -; FALLBACK22-NEXT: movl 28(%ecx), %edx -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl 24(%ecx), %ebp +; FALLBACK22-NEXT: movl 28(%ecx), %ecx +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: shlb $3, %bl +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: sarl $31, %edx -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %eax, %edx -; FALLBACK22-NEXT: notb %dl -; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK22-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: sarl $31, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ebx, %eax +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: movzbl %dl, %ecx +; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK22-NEXT: movl %eax, %ebp +; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %ebx, %ebx -; FALLBACK22-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK22-NEXT: orl %ebp, %ebx -; FALLBACK22-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK22-NEXT: sarxl %eax, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %edi -; FALLBACK22-NEXT: orl %ecx, %edi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl %ecx, 4(%edx) -; FALLBACK22-NEXT: movl %edi, 24(%edx) -; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edx +; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK22-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK22-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK22-NEXT: movl %ebp, %edx +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebp +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK22-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK22-NEXT: shrxl %edx, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %eax, %esi ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: sarxl %ebx, %ecx, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, 28(%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %esi, 24(%edx) +; FALLBACK22-NEXT: movl %edi, 16(%edx) +; FALLBACK22-NEXT: movl %ebp, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -10446,82 +10434,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK26-NEXT: movl 16(%ecx), %esi ; FALLBACK26-NEXT: movl 20(%ecx), %edi -; FALLBACK26-NEXT: movl 24(%ecx), %ebx -; FALLBACK26-NEXT: movl 28(%ecx), %edx -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shlb $3, %al -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl 24(%ecx), %ebp +; FALLBACK26-NEXT: movl 28(%ecx), %ecx +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: shlb $3, %bl +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: sarl $31, %edx -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %eax, %edx -; FALLBACK26-NEXT: notb %dl -; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebx -; FALLBACK26-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK26-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: sarl $31, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ebx, %eax +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: movzbl %dl, %ecx +; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK26-NEXT: movl %eax, %ebp +; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %ebx, %ebx -; FALLBACK26-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK26-NEXT: orl %ebp, %ebx -; FALLBACK26-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK26-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK26-NEXT: sarxl %eax, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %edi -; FALLBACK26-NEXT: orl %ecx, %edi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx +; FALLBACK26-NEXT: leal (%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edx +; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK26-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK26-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK26-NEXT: movl %ebp, %edx +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebp +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK26-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK26-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %eax, %esi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: sarxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %eax, 28(%edx) -; FALLBACK26-NEXT: movl %ecx, 4(%edx) -; FALLBACK26-NEXT: movl %edi, 24(%edx) -; FALLBACK26-NEXT: movl %ebx, 16(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: movl %ecx, 28(%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %esi, 24(%edx) +; FALLBACK26-NEXT: movl %edi, 16(%edx) +; FALLBACK26-NEXT: movl %ebp, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -10822,82 +10810,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK30-NEXT: movl 16(%ecx), %esi ; FALLBACK30-NEXT: movl 20(%ecx), %edi -; FALLBACK30-NEXT: movl 24(%ecx), %ebx -; FALLBACK30-NEXT: movl 28(%ecx), %edx -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shlb $3, %al -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl 24(%ecx), %ebp +; FALLBACK30-NEXT: movl 28(%ecx), %ecx +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: shlb $3, %bl +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: sarl $31, %edx -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %eax, %edx -; FALLBACK30-NEXT: notb %dl -; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK30-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: sarl $31, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ebx, %eax +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: movzbl %dl, %ecx +; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK30-NEXT: movl %eax, %ebp +; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %ebx, %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK30-NEXT: orl %ebp, %ebx -; FALLBACK30-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK30-NEXT: sarxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl %ecx, 4(%edx) -; FALLBACK30-NEXT: movl %edi, 24(%edx) -; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edx +; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK30-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK30-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK30-NEXT: movl %ebp, %edx +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebp +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK30-NEXT: shrxl %edx, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK30-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %eax, %esi ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: sarxl %ebx, %ecx, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, 28(%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %esi, 24(%edx) +; FALLBACK30-NEXT: movl %edi, 16(%edx) +; FALLBACK30-NEXT: movl %ebp, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -11104,30 +11092,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes_dwordOff: @@ -11268,30 +11256,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %ecx ; FALLBACK6-NEXT: andb $6, %sil -; FALLBACK6-NEXT: movzbl %sil, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movzbl %sil, %esi +; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes_dwordOff: @@ -11431,30 +11419,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movl %eax, %ecx ; FALLBACK10-NEXT: andb $6, %sil -; FALLBACK10-NEXT: movzbl %sil, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movzbl %sil, %esi +; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %rdi, %rax +; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes_dwordOff: @@ -11594,30 +11582,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andb $6, %sil -; FALLBACK14-NEXT: movzbl %sil, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movzbl %sil, %esi +; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %rdi, %rax +; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: movq %rcx, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes_dwordOff: @@ -12204,10 +12192,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: lshr_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -12235,60 +12221,58 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx ; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: movl %ecx, %esi ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8 +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r8, %r9 +; FALLBACK2-NEXT: notb %cl +; FALLBACK2-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK2-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK2-NEXT: leaq (%r14,%r14), %r9 +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r9 ; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK2-NEXT: addq %r11, %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK2-NEXT: orq %r10, %r11 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r10, %rbx +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r15 +; FALLBACK2-NEXT: leaq (%r15,%r15), %r12 +; FALLBACK2-NEXT: shlxq %rcx, %r12, %r12 +; FALLBACK2-NEXT: orq %rbx, %r12 +; FALLBACK2-NEXT: shrxq %rsi, %r14, %rbx ; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 -; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax -; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK2-NEXT: orq %rbx, %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r15, %rbx +; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK2-NEXT: leaq (%rax,%rax), %r14 +; FALLBACK2-NEXT: shlxq %rcx, %r14, %rcx +; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK2-NEXT: movq %rax, 56(%rdx) +; FALLBACK2-NEXT: movq %rcx, 48(%rdx) +; FALLBACK2-NEXT: movq %r10, 32(%rdx) +; FALLBACK2-NEXT: movq %r12, 40(%rdx) +; FALLBACK2-NEXT: movq %r11, 16(%rdx) +; FALLBACK2-NEXT: movq %r9, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_64bytes: @@ -12512,13 +12496,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: lshr_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 @@ -12533,62 +12515,60 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: leal (,%rax,8), %ecx +; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: movl %ecx, %esi ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK6-NEXT: notb %cl +; FALLBACK6-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK6-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK6-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK6-NEXT: orq %r8, %rdi +; FALLBACK6-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK6-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK6-NEXT: orq %rbx, %r8 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK6-NEXT: addq %r11, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK6-NEXT: orq %rbx, %r11 +; FALLBACK6-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK6-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK6-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK6-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK6-NEXT: orq %r15, %r13 +; FALLBACK6-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK6-NEXT: addq %rbx, %rbx +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK6-NEXT: orq %r14, %rbx +; FALLBACK6-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r9, %rdi -; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK6-NEXT: orq %r14, %r9 -; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK6-NEXT: orq %r15, %r10 -; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK6-NEXT: orq %r13, %rax -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK6-NEXT: orq %rbp, %rcx -; FALLBACK6-NEXT: movq %rsi, 56(%rdx) +; FALLBACK6-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK6-NEXT: orq %r14, %r15 +; FALLBACK6-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK6-NEXT: orq %r10, %rcx +; FALLBACK6-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK6-NEXT: movq %rax, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %r10, 32(%rdx) -; FALLBACK6-NEXT: movq %r9, 40(%rdx) -; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r8, (%rdx) -; FALLBACK6-NEXT: addq $8, %rsp +; FALLBACK6-NEXT: movq %r15, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 32(%rdx) +; FALLBACK6-NEXT: movq %r13, 40(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) +; FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_64bytes: @@ -12749,43 +12729,43 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: pushq %rbx ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK9-NEXT: movl (%rsi), %eax +; FALLBACK9-NEXT: movl (%rsi), %edi ; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx +; FALLBACK9-NEXT: leal (,%rdi,8), %ecx ; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK9-NEXT: andl $56, %edi +; FALLBACK9-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK9-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq %r9, %rax +; FALLBACK9-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK9-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK9-NEXT: movq %r10, %r8 ; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK9-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: movq %r11, %rbx ; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK9-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK9-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 +; FALLBACK9-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK9-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK9-NEXT: movq %rdi, %r15 ; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK9-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shrq %cl, %r11 ; FALLBACK9-NEXT: movq %r15, 8(%rdx) ; FALLBACK9-NEXT: movq %r9, 48(%rdx) ; FALLBACK9-NEXT: movq %r11, 56(%rdx) -; FALLBACK9-NEXT: movq %rdi, 32(%rdx) +; FALLBACK9-NEXT: movq %rsi, 32(%rdx) ; FALLBACK9-NEXT: movq %rbx, 40(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %rax, 24(%rdx) ; FALLBACK9-NEXT: movq %r14, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 @@ -12795,77 +12775,73 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: lshr_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: movl (%rsi), %esi ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl -; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: orq %r12, %r11 -; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp -; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK10-NEXT: orq %r9, %rdi -; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK10-NEXT: orq %r14, %r9 -; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK10-NEXT: orq %r15, %r10 -; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK10-NEXT: orq %r13, %rax -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK10-NEXT: orq %rbp, %rcx -; FALLBACK10-NEXT: movq %rsi, 56(%rdx) -; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %r10, 32(%rdx) -; FALLBACK10-NEXT: movq %r9, 40(%rdx) -; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r8, (%rdx) -; FALLBACK10-NEXT: addq $8, %rsp +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -120(%rsp,%rsi), %r10 +; FALLBACK10-NEXT: movq -112(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK10-NEXT: orq %r8, %rdi +; FALLBACK10-NEXT: movq -104(%rsp,%rsi), %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r11, %rbx +; FALLBACK10-NEXT: movq -96(%rsp,%rsi), %r14 +; FALLBACK10-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK10-NEXT: shlxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rbx, %r8 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rbx +; FALLBACK10-NEXT: addq %r11, %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rbx, %r11 +; FALLBACK10-NEXT: movq -88(%rsp,%rsi), %rbx +; FALLBACK10-NEXT: shrxq %rcx, %rbx, %r15 +; FALLBACK10-NEXT: movq -80(%rsp,%rsi), %r12 +; FALLBACK10-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK10-NEXT: shlxq %rax, %r13, %r13 +; FALLBACK10-NEXT: orq %r15, %r13 +; FALLBACK10-NEXT: shrxq %rcx, %r14, %r14 +; FALLBACK10-NEXT: addq %rbx, %rbx +; FALLBACK10-NEXT: shlxq %rax, %rbx, %rbx +; FALLBACK10-NEXT: orq %r14, %rbx +; FALLBACK10-NEXT: shrxq %rcx, %r12, %r14 +; FALLBACK10-NEXT: movq -72(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r15 +; FALLBACK10-NEXT: shlxq %rax, %r15, %r15 +; FALLBACK10-NEXT: orq %r14, %r15 +; FALLBACK10-NEXT: shrxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %r10, %rax +; FALLBACK10-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 56(%rdx) +; FALLBACK10-NEXT: movq %rax, 8(%rdx) +; FALLBACK10-NEXT: movq %r15, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 32(%rdx) +; FALLBACK10-NEXT: movq %r13, 40(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -12930,45 +12906,45 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: pushq %rax ; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK12-NEXT: movl (%rsi), %r9d +; FALLBACK12-NEXT: movl (%rsi), %r10d ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%r9,8), %eax +; FALLBACK12-NEXT: leal (,%r10,8), %eax ; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %r9d -; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 +; FALLBACK12-NEXT: andl $56, %r10d +; FALLBACK12-NEXT: movq -128(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq -120(%rsp,%r10), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %rbx +; FALLBACK12-NEXT: orq %r9, %rdi +; FALLBACK12-NEXT: movq -104(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq %r9, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12 +; FALLBACK12-NEXT: movq -96(%rsp,%r10), %r12 ; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r11 ; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx +; FALLBACK12-NEXT: movq -112(%rsp,%r10), %rbx ; FALLBACK12-NEXT: movq %rbx, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r14 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: addq %r9, %r9 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r14, %r10 -; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14 +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: orq %r14, %r9 +; FALLBACK12-NEXT: movq -88(%rsp,%r10), %r14 ; FALLBACK12-NEXT: movq %r14, %r13 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r13 -; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp +; FALLBACK12-NEXT: movq -80(%rsp,%r10), %rbp ; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r15 @@ -12981,8 +12957,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: orq %r12, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbp -; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %r12 +; FALLBACK12-NEXT: movq -72(%rsp,%r10), %r10 +; FALLBACK12-NEXT: leaq (%r10,%r10), %r12 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: orq %rbp, %r12 @@ -12993,13 +12969,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r8, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 56(%rdx) +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movq %r10, 56(%rdx) ; FALLBACK12-NEXT: movq %rbx, 8(%rdx) ; FALLBACK12-NEXT: movq %r12, 48(%rdx) ; FALLBACK12-NEXT: movq %r14, 32(%rdx) ; FALLBACK12-NEXT: movq %r15, 40(%rdx) -; FALLBACK12-NEXT: movq %r10, 16(%rdx) +; FALLBACK12-NEXT: movq %r9, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: addq $8, %rsp @@ -13062,74 +13038,70 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: lshr_64bytes: ; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK14-NEXT: movl (%rsi), %esi ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rsi,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: leal (,%rsi,8), %eax +; FALLBACK14-NEXT: andl $56, %eax +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andl $56, %esi -; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax -; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12 -; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9 -; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10 -; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14 -; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15 -; FALLBACK14-NEXT: movl %ecx, %ebx -; FALLBACK14-NEXT: notb %bl -; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %r10 +; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK14-NEXT: orq %r8, %rdi +; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r11, %rbx +; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r14 +; FALLBACK14-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK14-NEXT: shlxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rbx, %r8 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rbx +; FALLBACK14-NEXT: addq %r11, %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rbx, %r11 +; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %rbx +; FALLBACK14-NEXT: shrxq %rcx, %rbx, %r15 ; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12 -; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp +; FALLBACK14-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK14-NEXT: shlxq %rax, %r13, %r13 +; FALLBACK14-NEXT: orq %r15, %r13 +; FALLBACK14-NEXT: shrxq %rcx, %r14, %r14 +; FALLBACK14-NEXT: addq %rbx, %rbx +; FALLBACK14-NEXT: shlxq %rax, %rbx, %rbx +; FALLBACK14-NEXT: orq %r14, %rbx +; FALLBACK14-NEXT: shrxq %rcx, %r12, %r14 ; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r15 +; FALLBACK14-NEXT: shlxq %rax, %r15, %r15 +; FALLBACK14-NEXT: orq %r14, %r15 +; FALLBACK14-NEXT: shrxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %r10, %rax ; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK14-NEXT: orq %r9, %rdi -; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK14-NEXT: orq %r14, %r9 -; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK14-NEXT: orq %r15, %r10 -; FALLBACK14-NEXT: addq %rsi, %rsi -; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi -; FALLBACK14-NEXT: orq %r13, %rsi -; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK14-NEXT: orq %rbp, %rax ; FALLBACK14-NEXT: movq %rcx, 56(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rsi, 48(%rdx) -; FALLBACK14-NEXT: movq %r10, 32(%rdx) -; FALLBACK14-NEXT: movq %r9, 40(%rdx) -; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r8, (%rdx) -; FALLBACK14-NEXT: addq $8, %rsp +; FALLBACK14-NEXT: movq %r15, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 32(%rdx) +; FALLBACK14-NEXT: movq %r13, 40(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -13139,40 +13111,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: pushq %r14 ; FALLBACK15-NEXT: pushq %rbx ; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK15-NEXT: movl (%rsi), %eax +; FALLBACK15-NEXT: movl (%rsi), %edi ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx +; FALLBACK15-NEXT: leal (,%rdi,8), %ecx ; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK15-NEXT: andl $56, %edi +; FALLBACK15-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK15-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq %r9, %rax +; FALLBACK15-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK15-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK15-NEXT: movq %r10, %r8 ; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK15-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: movq %r11, %rbx ; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK15-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK15-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 +; FALLBACK15-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK15-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK15-NEXT: movq %rdi, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) -; FALLBACK15-NEXT: movq %rdi, 32(%rdx) +; FALLBACK15-NEXT: movq %rsi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rsi, 24(%rdx) +; FALLBACK15-NEXT: movq %rax, 24(%rdx) ; FALLBACK15-NEXT: movq %r14, (%rdx) ; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx @@ -13618,14 +13590,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 36(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ebp -; FALLBACK18-NEXT: movl 44(%eax), %ebx +; FALLBACK18-NEXT: movl 40(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 44(%eax), %ebp ; FALLBACK18-NEXT: movl 48(%eax), %edi ; FALLBACK18-NEXT: movl 52(%eax), %esi ; FALLBACK18-NEXT: movl 56(%eax), %edx ; FALLBACK18-NEXT: movl 60(%eax), %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %eax +; FALLBACK18-NEXT: movl (%eax), %ebx ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) @@ -13634,136 +13607,138 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, %ecx -; FALLBACK18-NEXT: leal (,%eax,8), %edx +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: leal (,%ebx,8), %edx ; FALLBACK18-NEXT: andl $24, %edx -; FALLBACK18-NEXT: andl $60, %ecx -; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %edx, %ecx +; FALLBACK18-NEXT: andl $60, %ebx +; FALLBACK18-NEXT: movl 68(%esp,%ebx), %esi +; FALLBACK18-NEXT: movl 72(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %edi -; FALLBACK18-NEXT: movl %edx, %ebx -; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: leal (%eax,%eax), %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 76(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 88(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 92(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 104(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 100(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 112(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl %ecx, %edi -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ecx, %ebp ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %ecx, %esi -; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %edx, %eax, %edi -; FALLBACK18-NEXT: orl %edi, %ecx -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %esi, %ecx +; FALLBACK18-NEXT: orl %eax, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK18-NEXT: movl 116(%esp,%ebx), %eax +; FALLBACK18-NEXT: movl %ebp, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebp +; FALLBACK18-NEXT: orl %ebp, %esi +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %ecx, %ebp ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK18-NEXT: orl %eax, %ebx +; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: movl 124(%esp,%ebx), %eax +; FALLBACK18-NEXT: leal (%eax,%eax), %ebx +; FALLBACK18-NEXT: shlxl %edx, %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: shrxl %ebp, %eax, %edi ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %edx, 60(%eax) -; FALLBACK18-NEXT: movl %ebx, 56(%eax) -; FALLBACK18-NEXT: movl %edi, 48(%eax) -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl %esi, 40(%eax) +; FALLBACK18-NEXT: movl %edi, 60(%eax) +; FALLBACK18-NEXT: movl %edx, 56(%eax) +; FALLBACK18-NEXT: movl %ecx, 48(%eax) +; FALLBACK18-NEXT: movl %esi, 52(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -14284,7 +14259,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK22-NEXT: movl (%eax), %ecx +; FALLBACK22-NEXT: movl (%eax), %ebx ; FALLBACK22-NEXT: xorps %xmm4, %xmm4 ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) @@ -14294,112 +14269,114 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%ecx,8), %edx +; FALLBACK22-NEXT: leal (,%ebx,8), %edx ; FALLBACK22-NEXT: andl $24, %edx -; FALLBACK22-NEXT: andl $60, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: andl $60, %ebx +; FALLBACK22-NEXT: movl 68(%esp,%ebx), %esi +; FALLBACK22-NEXT: movl 72(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %edi -; FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: notb %dl ; FALLBACK22-NEXT: leal (%eax,%eax), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: shlxl %edx, %ebp, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 76(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 88(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 84(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK22-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 92(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 104(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 100(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx -; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi +; FALLBACK22-NEXT: movl 112(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ecx, %ebp ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax -; FALLBACK22-NEXT: shrxl %edx, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, %esi, %ecx +; FALLBACK22-NEXT: orl %eax, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK22-NEXT: leal (%edi,%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK22-NEXT: movl 116(%esp,%ebx), %eax +; FALLBACK22-NEXT: movl %ebp, %ecx +; FALLBACK22-NEXT: shrxl %ebp, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %ebp ; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK22-NEXT: shrxl %edx, %ebp, %edx -; FALLBACK22-NEXT: addl %ebp, %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: movl 124(%esp,%ebx), %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %ebx +; FALLBACK22-NEXT: shlxl %edx, %ebx, %edx +; FALLBACK22-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK22-NEXT: orl %edi, %edx +; FALLBACK22-NEXT: shrxl %ebp, %eax, %edi ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edx, 60(%eax) -; FALLBACK22-NEXT: movl %ebx, 56(%eax) -; FALLBACK22-NEXT: movl %edi, 48(%eax) -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl %edi, 60(%eax) +; FALLBACK22-NEXT: movl %edx, 56(%eax) +; FALLBACK22-NEXT: movl %ecx, 48(%eax) +; FALLBACK22-NEXT: movl %esi, 52(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -14873,109 +14850,107 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: leal (,%ecx,8), %edx ; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %edi -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: notb %dl ; FALLBACK26-NEXT: leal (%eax,%eax), %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK26-NEXT: shlxl %edx, %ebp, %ebp ; FALLBACK26-NEXT: orl %edi, %ebp ; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %esi ; FALLBACK26-NEXT: orl %edi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: orl %eax, %ebp +; FALLBACK26-NEXT: movl 120(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %edx, %eax, %esi ; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: shrxl %ebx, %eax, %edi ; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax +; FALLBACK26-NEXT: shlxl %edx, %eax, %eax +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx -; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx -; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edx +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %edi, %edx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %edi ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: movl %edx, 60(%ecx) -; FALLBACK26-NEXT: movl %ebx, 56(%ecx) -; FALLBACK26-NEXT: movl %edi, 48(%ecx) +; FALLBACK26-NEXT: movl %edi, 60(%ecx) +; FALLBACK26-NEXT: movl %edx, 56(%ecx) +; FALLBACK26-NEXT: movl %eax, 48(%ecx) ; FALLBACK26-NEXT: movl %esi, 52(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 40(%ecx) +; FALLBACK26-NEXT: movl %ebp, 40(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 44(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -15430,115 +15405,113 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK30-NEXT: movl (%eax), %edx +; FALLBACK30-NEXT: movl (%eax), %ecx ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%edx,8), %ecx -; FALLBACK30-NEXT: andl $24, %ecx -; FALLBACK30-NEXT: andl $60, %edx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi -; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax +; FALLBACK30-NEXT: leal (,%ecx,8), %edx +; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: andl $60, %ecx +; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK30-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi -; FALLBACK30-NEXT: movl %ecx, %ebx -; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: notb %dl ; FALLBACK30-NEXT: leal (%eax,%eax), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: shlxl %edx, %ebp, %ebp ; FALLBACK30-NEXT: orl %edi, %ebp ; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi +; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: shlxl %edx, %esi, %esi ; FALLBACK30-NEXT: orl %edi, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi -; FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi -; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax -; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi +; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: orl %eax, %ebp +; FALLBACK30-NEXT: movl 120(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %edx, %eax, %esi +; FALLBACK30-NEXT: movl 116(%esp,%ecx), %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %edi ; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax -; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx -; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp -; FALLBACK30-NEXT: leal (%edx,%edx), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx -; FALLBACK30-NEXT: orl %eax, %edx +; FALLBACK30-NEXT: shlxl %edx, %eax, %eax +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl 124(%esp,%ecx), %ecx +; FALLBACK30-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edx +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %edi, %edx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %edi ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: movl %ebp, 60(%ecx) +; FALLBACK30-NEXT: movl %edi, 60(%ecx) ; FALLBACK30-NEXT: movl %edx, 56(%ecx) -; FALLBACK30-NEXT: movl %edi, 48(%ecx) +; FALLBACK30-NEXT: movl %eax, 48(%ecx) ; FALLBACK30-NEXT: movl %esi, 52(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 40(%ecx) +; FALLBACK30-NEXT: movl %ebp, 40(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 44(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -16196,10 +16169,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: shl_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -16227,62 +16198,60 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rsi,8), %eax ; FALLBACK2-NEXT: andl $56, %eax +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andl $56, %esi ; FALLBACK2-NEXT: negl %esi ; FALLBACK2-NEXT: movslq %esi, %rsi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10 -; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9 -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14 -; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK2-NEXT: shlxq %rax, %r8, %r15 -; FALLBACK2-NEXT: shlxq %rax, %r10, %r12 -; FALLBACK2-NEXT: movl %eax, %r13d -; FALLBACK2-NEXT: notb %r13b -; FALLBACK2-NEXT: shrq %r10 -; FALLBACK2-NEXT: shrxq %r13, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp -; FALLBACK2-NEXT: shrq %r14 -; FALLBACK2-NEXT: shrxq %r13, %r14, %r14 -; FALLBACK2-NEXT: orq %r11, %r14 -; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx -; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r9 +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %r8 +; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r10 ; FALLBACK2-NEXT: shrq %r9 -; FALLBACK2-NEXT: shrxq %r13, %r9, %r9 -; FALLBACK2-NEXT: orq %r15, %r9 +; FALLBACK2-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK2-NEXT: orq %r8, %r9 +; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %rbx +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r14 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %rbx, %r8 ; FALLBACK2-NEXT: shrq %rdi -; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi -; FALLBACK2-NEXT: orq %rbp, %rdi +; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK2-NEXT: orq %r14, %rdi +; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rbx +; FALLBACK2-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r15 +; FALLBACK2-NEXT: shlxq %rcx, %r15, %r12 +; FALLBACK2-NEXT: shrq %r15 +; FALLBACK2-NEXT: shrxq %rax, %r15, %r15 +; FALLBACK2-NEXT: orq %r14, %r15 +; FALLBACK2-NEXT: shrq %r11 +; FALLBACK2-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK2-NEXT: orq %r12, %r11 +; FALLBACK2-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14 +; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %rcx ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: shrq %r8 -; FALLBACK2-NEXT: shrxq %r13, %r8, %r8 -; FALLBACK2-NEXT: orq %rax, %r8 -; FALLBACK2-NEXT: movq %r12, (%rdx) -; FALLBACK2-NEXT: movq %r8, 48(%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK2-NEXT: orq %r14, %rsi +; FALLBACK2-NEXT: shrq %rbx +; FALLBACK2-NEXT: shrxq %rax, %rbx, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) +; FALLBACK2-NEXT: movq %rax, 48(%rdx) ; FALLBACK2-NEXT: movq %rsi, 56(%rdx) -; FALLBACK2-NEXT: movq %rdi, 32(%rdx) -; FALLBACK2-NEXT: movq %r9, 40(%rdx) -; FALLBACK2-NEXT: movq %rcx, 16(%rdx) -; FALLBACK2-NEXT: movq %r14, 24(%rdx) -; FALLBACK2-NEXT: movq %r10, 8(%rdx) +; FALLBACK2-NEXT: movq %r11, 32(%rdx) +; FALLBACK2-NEXT: movq %r15, 40(%rdx) +; FALLBACK2-NEXT: movq %rdi, 16(%rdx) +; FALLBACK2-NEXT: movq %r8, 24(%rdx) +; FALLBACK2-NEXT: movq %r9, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_64bytes: @@ -16509,86 +16478,81 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: shl_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 -; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: subq $24, %rsp +; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK6-NEXT: movl (%rsi), %eax +; FALLBACK6-NEXT: movl (%rsi), %esi ; FALLBACK6-NEXT: xorps %xmm4, %xmm4 ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm3, (%rsp) +; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %ecx -; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: leal (,%rsi,8), %eax ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: negl %eax -; FALLBACK6-NEXT: movslq %eax, %rsi -; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK6-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK6-NEXT: movl %ecx, %r9d -; FALLBACK6-NEXT: notb %r9b +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: negl %esi +; FALLBACK6-NEXT: movslq %esi, %rsi +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %r9, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK6-NEXT: shrq %r9 +; FALLBACK6-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK6-NEXT: orq %r10, %r9 +; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK6-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK6-NEXT: shrq %r10 +; FALLBACK6-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %r11, %r10 +; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK6-NEXT: shrq %r11 +; FALLBACK6-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %r14, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK6-NEXT: shrq %rbx +; FALLBACK6-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK6-NEXT: orq %r15, %rbx +; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK6-NEXT: orq %r12, %rdi -; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK6-NEXT: shrq %r13 -; FALLBACK6-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK6-NEXT: orq %r15, %r12 -; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK6-NEXT: shrq %r11 -; FALLBACK6-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK6-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK6-NEXT: shrq %r14 -; FALLBACK6-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK6-NEXT: orq %r10, %r14 -; FALLBACK6-NEXT: shrq %rsi -; FALLBACK6-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK6-NEXT: orq %rbx, %rsi -; FALLBACK6-NEXT: shrq %rax -; FALLBACK6-NEXT: shrxq %r9, %rax, %rax -; FALLBACK6-NEXT: orq %r8, %rax -; FALLBACK6-NEXT: shrq %rbp -; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK6-NEXT: orq %r15, %r8 -; FALLBACK6-NEXT: movq %rcx, (%rdx) -; FALLBACK6-NEXT: movq %r8, 56(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %rsi, 8(%rdx) -; FALLBACK6-NEXT: movq %r14, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r12, 32(%rdx) -; FALLBACK6-NEXT: movq %rdi, 40(%rdx) -; FALLBACK6-NEXT: addq $24, %rsp +; FALLBACK6-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK6-NEXT: shrq %r15 +; FALLBACK6-NEXT: shrxq %rax, %r15, %rax +; FALLBACK6-NEXT: orq %rcx, %rax +; FALLBACK6-NEXT: movq %r14, (%rdx) +; FALLBACK6-NEXT: movq %rax, 56(%rdx) +; FALLBACK6-NEXT: movq %rdi, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 8(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, 24(%rdx) +; FALLBACK6-NEXT: movq %r9, 32(%rdx) +; FALLBACK6-NEXT: movq %r8, 40(%rdx) +; FALLBACK6-NEXT: addq $8, %rsp ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 -; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_64bytes: @@ -16798,80 +16762,75 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: shl_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 -; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: subq $24, %rsp +; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: movl (%rsi), %esi ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %ecx -; FALLBACK10-NEXT: andl $56, %ecx +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: negl %eax -; FALLBACK10-NEXT: movslq %eax, %rsi -; FALLBACK10-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK10-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK10-NEXT: movl %ecx, %r9d -; FALLBACK10-NEXT: notb %r9b +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: negl %esi +; FALLBACK10-NEXT: movslq %esi, %rsi +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %r9, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK10-NEXT: shrq %r9 +; FALLBACK10-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK10-NEXT: orq %r10, %r9 +; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK10-NEXT: shrq %r10 +; FALLBACK10-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %r11, %r10 +; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK10-NEXT: shrq %r11 +; FALLBACK10-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %r14, %r11 +; FALLBACK10-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK10-NEXT: shrq %rbx +; FALLBACK10-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK10-NEXT: orq %r15, %rbx +; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK10-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK10-NEXT: orq %r12, %rdi -; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK10-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK10-NEXT: shrq %r13 -; FALLBACK10-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK10-NEXT: orq %r15, %r12 -; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK10-NEXT: shrq %r11 -; FALLBACK10-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK10-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK10-NEXT: shrq %r14 -; FALLBACK10-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK10-NEXT: orq %r10, %r14 -; FALLBACK10-NEXT: shrq %rsi -; FALLBACK10-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK10-NEXT: orq %rbx, %rsi -; FALLBACK10-NEXT: shrq %rax -; FALLBACK10-NEXT: shrxq %r9, %rax, %rax -; FALLBACK10-NEXT: orq %r8, %rax -; FALLBACK10-NEXT: shrq %rbp -; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK10-NEXT: orq %r15, %r8 -; FALLBACK10-NEXT: movq %rcx, (%rdx) -; FALLBACK10-NEXT: movq %r8, 56(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %rsi, 8(%rdx) -; FALLBACK10-NEXT: movq %r14, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r12, 32(%rdx) -; FALLBACK10-NEXT: movq %rdi, 40(%rdx) -; FALLBACK10-NEXT: addq $24, %rsp +; FALLBACK10-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK10-NEXT: shrq %r15 +; FALLBACK10-NEXT: shrxq %rax, %r15, %rax +; FALLBACK10-NEXT: orq %rcx, %rax +; FALLBACK10-NEXT: movq %r14, (%rdx) +; FALLBACK10-NEXT: movq %rax, 56(%rdx) +; FALLBACK10-NEXT: movq %rdi, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, 24(%rdx) +; FALLBACK10-NEXT: movq %r9, 32(%rdx) +; FALLBACK10-NEXT: movq %r8, 40(%rdx) +; FALLBACK10-NEXT: addq $8, %rsp ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 -; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -17071,77 +17030,72 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: shl_64bytes: ; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 -; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: subq $24, %rsp +; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK14-NEXT: movl (%rsi), %eax +; FALLBACK14-NEXT: movl (%rsi), %esi ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rax,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: leal (,%rsi,8), %eax ; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: negl %eax -; FALLBACK14-NEXT: movslq %eax, %rsi -; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK14-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK14-NEXT: movl %ecx, %r9d -; FALLBACK14-NEXT: notb %r9b +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: andl $56, %esi +; FALLBACK14-NEXT: negl %esi +; FALLBACK14-NEXT: movslq %esi, %rsi +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %r9, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK14-NEXT: shrq %r9 +; FALLBACK14-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK14-NEXT: orq %r10, %r9 +; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK14-NEXT: shrq %r10 +; FALLBACK14-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %r11, %r10 +; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK14-NEXT: shrq %r11 +; FALLBACK14-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %r14, %r11 +; FALLBACK14-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK14-NEXT: shrq %rbx +; FALLBACK14-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK14-NEXT: orq %r15, %rbx +; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK14-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK14-NEXT: orq %r12, %rdi -; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK14-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK14-NEXT: shrq %r13 -; FALLBACK14-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK14-NEXT: orq %r15, %r12 -; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: shrq %r11 -; FALLBACK14-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK14-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK14-NEXT: shrq %r14 -; FALLBACK14-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK14-NEXT: orq %r10, %r14 -; FALLBACK14-NEXT: shrq %rsi -; FALLBACK14-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK14-NEXT: orq %rbx, %rsi -; FALLBACK14-NEXT: shrq %rax -; FALLBACK14-NEXT: shrxq %r9, %rax, %rax -; FALLBACK14-NEXT: orq %r8, %rax -; FALLBACK14-NEXT: shrq %rbp -; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK14-NEXT: orq %r15, %r8 -; FALLBACK14-NEXT: movq %rcx, (%rdx) -; FALLBACK14-NEXT: movq %r8, 56(%rdx) -; FALLBACK14-NEXT: movq %rax, 48(%rdx) -; FALLBACK14-NEXT: movq %rsi, 8(%rdx) -; FALLBACK14-NEXT: movq %r14, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r12, 32(%rdx) -; FALLBACK14-NEXT: movq %rdi, 40(%rdx) -; FALLBACK14-NEXT: addq $24, %rsp +; FALLBACK14-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK14-NEXT: shrq %r15 +; FALLBACK14-NEXT: shrxq %rax, %r15, %rax +; FALLBACK14-NEXT: orq %rcx, %rax +; FALLBACK14-NEXT: movq %r14, (%rdx) +; FALLBACK14-NEXT: movq %rax, 56(%rdx) +; FALLBACK14-NEXT: movq %rdi, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, 24(%rdx) +; FALLBACK14-NEXT: movq %r9, 32(%rdx) +; FALLBACK14-NEXT: movq %r8, 40(%rdx) +; FALLBACK14-NEXT: addq $8, %rsp ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 -; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -17681,144 +17635,149 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: leal (,%ebp,8), %edx -; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: leal (,%ebp,8), %ebx +; FALLBACK18-NEXT: andl $24, %ebx +; FALLBACK18-NEXT: movl %ebx, %eax ; FALLBACK18-NEXT: andl $60, %ebp ; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK18-NEXT: subl %ebp, %edi -; FALLBACK18-NEXT: movl (%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %edx, %ebx +; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: subl %ebp, %edx +; FALLBACK18-NEXT: movl (%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%edx), %ecx ; FALLBACK18-NEXT: notb %bl -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK18-NEXT: orl %ecx, %esi +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK18-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: orl %esi, %edi +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%edx), %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%edi), %esi -; FALLBACK18-NEXT: movl %esi, %ecx -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 12(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %esi, %esi -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: orl %esi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 16(%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: movl 20(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 12(%edx), %esi +; FALLBACK18-NEXT: movl %ebp, %edi +; FALLBACK18-NEXT: shlxl %ebp, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK18-NEXT: orl %eax, %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 24(%edi), %ecx +; FALLBACK18-NEXT: movl 16(%edx), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 28(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: movl 20(%edx), %ecx +; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %eax, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%edi), %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: movl 36(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: movl 24(%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 28(%edx), %esi +; FALLBACK18-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%edi), %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 32(%edx), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 44(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: movl 36(%edx), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %edi, %eax ; FALLBACK18-NEXT: shrl %esi ; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %eax, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%edi), %esi +; FALLBACK18-NEXT: orl %ebp, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 40(%edx), %edi +; FALLBACK18-NEXT: movl %edi, %esi ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 52(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK18-NEXT: movl 44(%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK18-NEXT: orl %ebp, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %edi, %edi +; FALLBACK18-NEXT: movl %eax, %esi +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: shrl %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK18-NEXT: orl %eax, %ebp -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl 48(%edx), %ebp +; FALLBACK18-NEXT: movl %ebp, %edi +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 52(%edx), %ecx +; FALLBACK18-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK18-NEXT: movl %esi, %ebp ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: negl %eax -; FALLBACK18-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK18-NEXT: movl 56(%edi), %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %edx -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %edx, %esi ; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, (%eax) -; FALLBACK18-NEXT: movl %esi, 56(%eax) -; FALLBACK18-NEXT: movl %ecx, 60(%eax) -; FALLBACK18-NEXT: movl %ebp, 48(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 40(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 44(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 32(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 36(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 24(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 28(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK18-NEXT: orl %edi, %esi +; FALLBACK18-NEXT: movl 56(%edx), %edi +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK18-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK18-NEXT: negl %ebx +; FALLBACK18-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK18-NEXT: orl %ecx, %ebx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: movl %edi, (%edx) +; FALLBACK18-NEXT: movl %eax, 56(%edx) +; FALLBACK18-NEXT: movl %ebx, 60(%edx) +; FALLBACK18-NEXT: movl %esi, 48(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 52(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 40(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 44(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 32(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 36(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 24(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 28(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edx) ; FALLBACK18-NEXT: addl $204, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -18342,144 +18301,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%eax,8), %edx -; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: leal (,%eax,8), %ebx +; FALLBACK22-NEXT: andl $24, %ebx +; FALLBACK22-NEXT: movl %ebx, %ecx ; FALLBACK22-NEXT: andl $60, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK22-NEXT: subl %eax, %edi -; FALLBACK22-NEXT: movl (%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 4(%edi), %eax +; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: subl %eax, %edx +; FALLBACK22-NEXT: movl (%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 4(%edx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK22-NEXT: orl %ecx, %esi +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK22-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 8(%edx), %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 8(%edi), %esi -; FALLBACK22-NEXT: movl %esi, %ecx -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 12(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, %esi, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 16(%edi), %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: movl 20(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: movl 12(%edx), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ecx, %edi +; FALLBACK22-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: orl %eax, %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 24(%edi), %ecx +; FALLBACK22-NEXT: movl 16(%edx), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 28(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK22-NEXT: movl 20(%edx), %ecx +; FALLBACK22-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %eax, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 32(%edi), %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: movl 36(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK22-NEXT: movl 24(%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: movl 28(%edx), %esi +; FALLBACK22-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %eax, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 40(%edi), %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 32(%edx), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK22-NEXT: movl 44(%edi), %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK22-NEXT: movl 36(%edx), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, %eax ; FALLBACK22-NEXT: shrl %esi ; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %eax, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%edi), %esi +; FALLBACK22-NEXT: orl %ebp, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 40(%edx), %edi +; FALLBACK22-NEXT: movl %edi, %esi ; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK22-NEXT: movl 52(%edi), %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK22-NEXT: movl 44(%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, %edi, %edi +; FALLBACK22-NEXT: movl %eax, %esi +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK22-NEXT: orl %eax, %ebp -; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl 48(%edx), %ebp +; FALLBACK22-NEXT: movl %ebp, %edi +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 52(%edx), %ecx +; FALLBACK22-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK22-NEXT: movl %esi, %ebp ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: negl %eax -; FALLBACK22-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK22-NEXT: movl 56(%edi), %eax -; FALLBACK22-NEXT: shlxl %edx, %eax, %edx -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edx, %esi ; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %eax, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK22-NEXT: movl %edx, (%eax) -; FALLBACK22-NEXT: movl %esi, 56(%eax) -; FALLBACK22-NEXT: movl %ecx, 60(%eax) -; FALLBACK22-NEXT: movl %ebp, 48(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 40(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 44(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 32(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 36(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 24(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 28(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 16(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 20(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 8(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 12(%eax) -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: movl %ecx, 4(%eax) +; FALLBACK22-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK22-NEXT: orl %edi, %esi +; FALLBACK22-NEXT: movl 56(%edx), %edi +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK22-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK22-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK22-NEXT: negl %ebx +; FALLBACK22-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK22-NEXT: orl %ecx, %ebx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %edi, (%edx) +; FALLBACK22-NEXT: movl %eax, 56(%edx) +; FALLBACK22-NEXT: movl %ebx, 60(%edx) +; FALLBACK22-NEXT: movl %esi, 48(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 52(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 40(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 44(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 32(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 36(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 24(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 28(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 16(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 8(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 12(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 4(%edx) ; FALLBACK22-NEXT: addl $204, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -18943,144 +18908,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: leal (,%eax,8), %edx -; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: leal (,%eax,8), %ebx +; FALLBACK26-NEXT: andl $24, %ebx +; FALLBACK26-NEXT: movl %ebx, %ecx ; FALLBACK26-NEXT: andl $60, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK26-NEXT: subl %eax, %edi -; FALLBACK26-NEXT: movl (%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 4(%edi), %eax +; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: subl %eax, %edx +; FALLBACK26-NEXT: movl (%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 4(%edx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK26-NEXT: orl %ecx, %esi +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 8(%edx), %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 8(%edi), %esi -; FALLBACK26-NEXT: movl %esi, %ecx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 12(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, %esi, %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 16(%edi), %eax -; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: movl 20(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: movl 12(%edx), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %ecx, %edi +; FALLBACK26-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: orl %eax, %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 24(%edi), %ecx +; FALLBACK26-NEXT: movl 16(%edx), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 28(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK26-NEXT: movl 20(%edx), %ecx +; FALLBACK26-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 32(%edi), %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: movl 36(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: movl 24(%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: movl 28(%edx), %esi +; FALLBACK26-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 40(%edi), %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 32(%edx), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK26-NEXT: movl 44(%edi), %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK26-NEXT: movl 36(%edx), %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, %eax ; FALLBACK26-NEXT: shrl %esi ; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%edi), %esi +; FALLBACK26-NEXT: orl %ebp, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 40(%edx), %edi +; FALLBACK26-NEXT: movl %edi, %esi ; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK26-NEXT: movl 52(%edi), %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK26-NEXT: movl 44(%edx), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %eax, %edi, %edi +; FALLBACK26-NEXT: movl %eax, %esi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: shrl %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK26-NEXT: orl %eax, %ebp -; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl 48(%edx), %ebp +; FALLBACK26-NEXT: movl %ebp, %edi +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK26-NEXT: movl 52(%edx), %ecx +; FALLBACK26-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK26-NEXT: movl %esi, %ebp ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: negl %eax -; FALLBACK26-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK26-NEXT: movl 56(%edi), %eax -; FALLBACK26-NEXT: shlxl %edx, %eax, %edx -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %edx, %esi ; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %eax, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK26-NEXT: movl %edx, (%eax) -; FALLBACK26-NEXT: movl %esi, 56(%eax) -; FALLBACK26-NEXT: movl %ecx, 60(%eax) -; FALLBACK26-NEXT: movl %ebp, 48(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 52(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 40(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 44(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 32(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 36(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 24(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 28(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 16(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 20(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 8(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 12(%eax) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: movl %ecx, 4(%eax) +; FALLBACK26-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: movl 56(%edx), %edi +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK26-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK26-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK26-NEXT: negl %ebx +; FALLBACK26-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK26-NEXT: orl %ecx, %ebx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %edi, (%edx) +; FALLBACK26-NEXT: movl %eax, 56(%edx) +; FALLBACK26-NEXT: movl %ebx, 60(%edx) +; FALLBACK26-NEXT: movl %esi, 48(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 52(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 40(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 44(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 32(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 36(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 24(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 28(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 16(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 8(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 12(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 4(%edx) ; FALLBACK26-NEXT: addl $204, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -19531,144 +19502,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%eax,8), %edx -; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: leal (,%eax,8), %ebx +; FALLBACK30-NEXT: andl $24, %ebx +; FALLBACK30-NEXT: movl %ebx, %ecx ; FALLBACK30-NEXT: andl $60, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK30-NEXT: subl %eax, %edi -; FALLBACK30-NEXT: movl (%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 4(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK30-NEXT: orl %ecx, %esi +; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: subl %eax, %edx +; FALLBACK30-NEXT: movl (%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 8(%edi), %esi -; FALLBACK30-NEXT: movl %esi, %ecx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 12(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 16(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: movl 20(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %eax, %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 24(%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 28(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl 4(%edx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 8(%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 32(%edi), %eax -; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: movl 36(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK30-NEXT: movl 12(%edx), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %ecx, %edi +; FALLBACK30-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK30-NEXT: orl %eax, %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 40(%edi), %ecx +; FALLBACK30-NEXT: movl 16(%edx), %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK30-NEXT: movl 44(%edi), %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK30-NEXT: movl 20(%edx), %ecx +; FALLBACK30-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%edi), %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 24(%edx), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 52(%edi), %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: movl 28(%edx), %esi +; FALLBACK30-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK30-NEXT: orl %eax, %ebp -; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: negl %eax -; FALLBACK30-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK30-NEXT: movl 56(%edi), %eax -; FALLBACK30-NEXT: shlxl %edx, %eax, %edx +; FALLBACK30-NEXT: movl 32(%edx), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: movl 36(%edx), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edi, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, %eax ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edx, %esi +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 40(%edx), %edi +; FALLBACK30-NEXT: movl %edi, %esi +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK30-NEXT: movl 44(%edx), %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, %edi, %edi +; FALLBACK30-NEXT: movl %eax, %esi +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: shrl %eax ; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %eax, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK30-NEXT: movl %edx, (%eax) -; FALLBACK30-NEXT: movl %esi, 56(%eax) -; FALLBACK30-NEXT: movl %ecx, 60(%eax) -; FALLBACK30-NEXT: movl %ebp, 48(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 52(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 40(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 44(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 32(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 36(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 24(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 28(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 16(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 20(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 8(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 12(%eax) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: movl %ecx, 4(%eax) +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%edx), %ebp +; FALLBACK30-NEXT: movl %ebp, %edi +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 52(%edx), %ecx +; FALLBACK30-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK30-NEXT: movl %esi, %ebp +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: shrl %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: movl 56(%edx), %edi +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK30-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK30-NEXT: negl %ebx +; FALLBACK30-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK30-NEXT: orl %ecx, %ebx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %edi, (%edx) +; FALLBACK30-NEXT: movl %eax, 56(%edx) +; FALLBACK30-NEXT: movl %ebx, 60(%edx) +; FALLBACK30-NEXT: movl %esi, 48(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 52(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 40(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 44(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 32(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 36(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 24(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 28(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 16(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 12(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 4(%edx) ; FALLBACK30-NEXT: addl $204, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -20336,10 +20313,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: ashr_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -20371,60 +20346,58 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx ; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: movl %ecx, %esi ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8 +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r8, %r9 +; FALLBACK2-NEXT: notb %cl +; FALLBACK2-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK2-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK2-NEXT: leaq (%r14,%r14), %r9 +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r9 ; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK2-NEXT: addq %r11, %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK2-NEXT: orq %r10, %r11 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r10, %rbx +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r15 +; FALLBACK2-NEXT: leaq (%r15,%r15), %r12 +; FALLBACK2-NEXT: shlxq %rcx, %r12, %r12 +; FALLBACK2-NEXT: orq %rbx, %r12 +; FALLBACK2-NEXT: shrxq %rsi, %r14, %rbx ; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 -; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax -; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK2-NEXT: orq %rbx, %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r15, %rbx +; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK2-NEXT: leaq (%rax,%rax), %r14 +; FALLBACK2-NEXT: shlxq %rcx, %r14, %rcx +; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK2-NEXT: movq %rax, 56(%rdx) +; FALLBACK2-NEXT: movq %rcx, 48(%rdx) +; FALLBACK2-NEXT: movq %r10, 32(%rdx) +; FALLBACK2-NEXT: movq %r12, 40(%rdx) +; FALLBACK2-NEXT: movq %r11, 16(%rdx) +; FALLBACK2-NEXT: movq %r9, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_64bytes: @@ -20664,13 +20637,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: ashr_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 @@ -20691,62 +20662,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: leal (,%rax,8), %ecx +; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: movl %ecx, %esi ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK6-NEXT: notb %cl +; FALLBACK6-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK6-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK6-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK6-NEXT: orq %r8, %rdi +; FALLBACK6-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK6-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK6-NEXT: orq %rbx, %r8 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK6-NEXT: addq %r11, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK6-NEXT: orq %rbx, %r11 +; FALLBACK6-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK6-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK6-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK6-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK6-NEXT: orq %r15, %r13 +; FALLBACK6-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK6-NEXT: addq %rbx, %rbx +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK6-NEXT: orq %r14, %rbx +; FALLBACK6-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r9, %rdi -; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK6-NEXT: orq %r14, %r9 -; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK6-NEXT: orq %r15, %r10 -; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK6-NEXT: orq %r13, %rax -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK6-NEXT: orq %rbp, %rcx -; FALLBACK6-NEXT: movq %rsi, 56(%rdx) +; FALLBACK6-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK6-NEXT: orq %r14, %r15 +; FALLBACK6-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK6-NEXT: orq %r10, %rcx +; FALLBACK6-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK6-NEXT: movq %rax, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %r10, 32(%rdx) -; FALLBACK6-NEXT: movq %r9, 40(%rdx) -; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r8, (%rdx) -; FALLBACK6-NEXT: addq $8, %rsp +; FALLBACK6-NEXT: movq %r15, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 32(%rdx) +; FALLBACK6-NEXT: movq %r13, 40(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) +; FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_64bytes: @@ -20979,13 +20948,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: ashr_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK10-NEXT: movq 48(%rdi), %rcx @@ -21004,62 +20971,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: leal (,%rax,8), %ecx +; FALLBACK10-NEXT: andl $56, %ecx +; FALLBACK10-NEXT: movl %ecx, %esi ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl -; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: orq %r12, %r11 +; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK10-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK10-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK10-NEXT: orq %r8, %rdi +; FALLBACK10-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK10-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK10-NEXT: orq %rbx, %r8 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK10-NEXT: addq %r11, %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rbx, %r11 +; FALLBACK10-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK10-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK10-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK10-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK10-NEXT: orq %r15, %r13 +; FALLBACK10-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK10-NEXT: addq %rbx, %rbx +; FALLBACK10-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK10-NEXT: orq %r14, %rbx +; FALLBACK10-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK10-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK10-NEXT: orq %r9, %rdi -; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK10-NEXT: orq %r14, %r9 -; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK10-NEXT: orq %r15, %r10 -; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK10-NEXT: orq %r13, %rax -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK10-NEXT: orq %rbp, %rcx -; FALLBACK10-NEXT: movq %rsi, 56(%rdx) +; FALLBACK10-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK10-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK10-NEXT: orq %r14, %r15 +; FALLBACK10-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %r10, %rcx +; FALLBACK10-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 56(%rdx) ; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %r10, 32(%rdx) -; FALLBACK10-NEXT: movq %r9, 40(%rdx) -; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r8, (%rdx) -; FALLBACK10-NEXT: addq $8, %rsp +; FALLBACK10-NEXT: movq %r15, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 32(%rdx) +; FALLBACK10-NEXT: movq %r13, 40(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -21292,13 +21257,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: ashr_64bytes: ; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK14-NEXT: movq 48(%rdi), %rcx @@ -21317,62 +21280,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rax,8), %esi -; FALLBACK14-NEXT: andl $56, %esi +; FALLBACK14-NEXT: leal (,%rax,8), %ecx +; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: movl %ecx, %esi ; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK14-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK14-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK14-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK14-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK14-NEXT: movl %esi, %ebx -; FALLBACK14-NEXT: notb %bl -; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK14-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK14-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK14-NEXT: orq %r8, %rdi +; FALLBACK14-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK14-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK14-NEXT: orq %rbx, %r8 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK14-NEXT: addq %r11, %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rbx, %r11 +; FALLBACK14-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK14-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK14-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK14-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK14-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK14-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK14-NEXT: orq %r15, %r13 +; FALLBACK14-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK14-NEXT: addq %rbx, %rbx +; FALLBACK14-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK14-NEXT: orq %r14, %rbx +; FALLBACK14-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK14-NEXT: sarxq %rsi, %rax, %rsi -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK14-NEXT: orq %r9, %rdi -; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK14-NEXT: orq %r14, %r9 -; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK14-NEXT: orq %r15, %r10 -; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK14-NEXT: orq %r13, %rax -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK14-NEXT: orq %rbp, %rcx -; FALLBACK14-NEXT: movq %rsi, 56(%rdx) +; FALLBACK14-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK14-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK14-NEXT: orq %r14, %r15 +; FALLBACK14-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %r10, %rcx +; FALLBACK14-NEXT: sarxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 56(%rdx) ; FALLBACK14-NEXT: movq %rcx, 8(%rdx) -; FALLBACK14-NEXT: movq %rax, 48(%rdx) -; FALLBACK14-NEXT: movq %r10, 32(%rdx) -; FALLBACK14-NEXT: movq %r9, 40(%rdx) -; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r8, (%rdx) -; FALLBACK14-NEXT: addq $8, %rsp +; FALLBACK14-NEXT: movq %r15, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 32(%rdx) +; FALLBACK14-NEXT: movq %r13, 40(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -21960,111 +21921,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: leal (,%eax,8), %edx ; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: movl %edx, %ebx ; FALLBACK18-NEXT: andl $60, %ecx ; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %edx, %ebx -; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: leal (%edi,%edi), %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %ecx, %ebp +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl %ecx, %edi -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %ecx, %esi -; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %edx, %eax, %edi -; FALLBACK18-NEXT: orl %edi, %ecx -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK18-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK18-NEXT: orl %ebp, %esi +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK18-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK18-NEXT: orl %eax, %ebx +; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK18-NEXT: orl %ebp, %ecx +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK18-NEXT: leal (%eax,%eax), %ebp +; FALLBACK18-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK18-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %edx, 60(%eax) -; FALLBACK18-NEXT: movl %ebx, 56(%eax) -; FALLBACK18-NEXT: movl %edi, 48(%eax) -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl %esi, 40(%eax) +; FALLBACK18-NEXT: movl %edi, 60(%eax) +; FALLBACK18-NEXT: movl %edx, 56(%eax) +; FALLBACK18-NEXT: movl %ecx, 48(%eax) +; FALLBACK18-NEXT: movl %esi, 52(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -22664,111 +22626,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl %eax, %ecx ; FALLBACK22-NEXT: leal (,%eax,8), %edx ; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: andl $60, %ecx ; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: notb %dl ; FALLBACK22-NEXT: leal (%edi,%edi), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK22-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax ; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %ebp +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax ; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl %ecx, %edi -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK22-NEXT: shrxl %edx, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK22-NEXT: leal (%edi,%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK22-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK22-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK22-NEXT: addl %ebp, %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %ebp +; FALLBACK22-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK22-NEXT: orl %edi, %edx +; FALLBACK22-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edx, 60(%eax) -; FALLBACK22-NEXT: movl %ebx, 56(%eax) -; FALLBACK22-NEXT: movl %edi, 48(%eax) -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl %edi, 60(%eax) +; FALLBACK22-NEXT: movl %edx, 56(%eax) +; FALLBACK22-NEXT: movl %ecx, 48(%eax) +; FALLBACK22-NEXT: movl %esi, 52(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -23326,111 +23289,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl %eax, %ecx ; FALLBACK26-NEXT: leal (,%eax,8), %edx ; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: notb %dl ; FALLBACK26-NEXT: leal (%edi,%edi), %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK26-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl %ecx, %ebp +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl %ecx, %edi -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK26-NEXT: leal (%edi,%edi), %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK26-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK26-NEXT: orl %ebp, %esi +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK26-NEXT: addl %ebp, %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK26-NEXT: leal (%eax,%eax), %ebp +; FALLBACK26-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK26-NEXT: orl %edi, %edx +; FALLBACK26-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl %edx, 60(%eax) -; FALLBACK26-NEXT: movl %ebx, 56(%eax) -; FALLBACK26-NEXT: movl %edi, 48(%eax) -; FALLBACK26-NEXT: movl %ecx, 52(%eax) -; FALLBACK26-NEXT: movl %esi, 40(%eax) +; FALLBACK26-NEXT: movl %edi, 60(%eax) +; FALLBACK26-NEXT: movl %edx, 56(%eax) +; FALLBACK26-NEXT: movl %ecx, 48(%eax) +; FALLBACK26-NEXT: movl %esi, 52(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 40(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 44(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -23988,111 +23952,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl %eax, %ecx ; FALLBACK30-NEXT: leal (,%eax,8), %edx ; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: movl %edx, %ebx ; FALLBACK30-NEXT: andl $60, %ecx ; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: notb %dl ; FALLBACK30-NEXT: leal (%edi,%edi), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK30-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax ; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl %ecx, %ebp +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax ; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl %ecx, %edi -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK30-NEXT: shrxl %edx, %eax, %edi -; FALLBACK30-NEXT: orl %edi, %ecx -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK30-NEXT: leal (%edi,%edi), %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK30-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK30-NEXT: addl %ebp, %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK30-NEXT: orl %eax, %ebx +; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK30-NEXT: leal (%eax,%eax), %ebp +; FALLBACK30-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK30-NEXT: orl %edi, %edx +; FALLBACK30-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl %edx, 60(%eax) -; FALLBACK30-NEXT: movl %ebx, 56(%eax) -; FALLBACK30-NEXT: movl %edi, 48(%eax) -; FALLBACK30-NEXT: movl %ecx, 52(%eax) -; FALLBACK30-NEXT: movl %esi, 40(%eax) +; FALLBACK30-NEXT: movl %edi, 60(%eax) +; FALLBACK30-NEXT: movl %edx, 56(%eax) +; FALLBACK30-NEXT: movl %ecx, 48(%eax) +; FALLBACK30-NEXT: movl %esi, 52(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 40(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 44(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 338e104fbe8f0..221a51ed44696 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -712,33 +712,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -994,42 +994,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1297,33 +1297,33 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1487,31 +1487,31 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: @@ -1761,88 +1761,90 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -2040,32 +2042,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; @@ -2319,97 +2321,101 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, 92(%esp,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -2610,31 +2616,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: @@ -2927,60 +2933,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 32(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 24(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3263,13 +3268,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -3292,65 +3295,63 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes: @@ -3868,20 +3869,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -3906,116 +3907,117 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4388,10 +4390,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax @@ -4419,63 +4419,61 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r15, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r15, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rbx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes: @@ -4972,33 +4970,33 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 @@ -5011,7 +5009,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -5032,149 +5030,152 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 60(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 40(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 44(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 36(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -5534,13 +5535,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -5567,65 +5566,63 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes: @@ -6221,33 +6218,31 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi @@ -6256,87 +6251,84 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index c3054a365c466..6b5c6049f025b 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -1635,22 +1635,22 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: @@ -1807,40 +1807,43 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1906,13 +1909,13 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movb %cl, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2070,13 +2073,13 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movw %cx, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2233,13 +2236,13 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movl %ecx, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2521,10 +2524,11 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $140, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -2541,25 +2545,26 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp,%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $140, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> @@ -2667,21 +2672,21 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r10, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq @@ -2860,33 +2865,33 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 16(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 8(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp @@ -3026,9 +3031,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 @@ -3043,38 +3046,36 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: @@ -3304,7 +3305,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $172, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 @@ -3320,59 +3321,60 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 32(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) @@ -3380,7 +3382,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $172, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 7735500bd3a88..bed8e5806380c 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -1879,22 +1879,22 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: @@ -2055,40 +2055,43 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll index 4d261a9810896..9fbbba2ed3b47 100644 --- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -820,7 +820,7 @@ define void @infiniteloop() { ; ENABLE-NEXT: movq %rsp, %rax ; ENABLE-NEXT: addq $-16, %rax ; ENABLE-NEXT: movq %rax, %rsp -; ENABLE-NEXT: xorl %ecx, %ecx +; ENABLE-NEXT: xorl %ecx, %ecx ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB10_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -851,8 +851,8 @@ define void @infiniteloop() { ; DISABLE-NEXT: ## %bb.1: ## %if.then ; DISABLE-NEXT: movq %rsp, %rax ; DISABLE-NEXT: addq $-16, %rax -; DISABLE-NEXT: %rax, %rsp -; DISABLE-NEXT: xorl %ecx, %ecx +; DISABLE-NEXT: movq %rax, %rsp +; DISABLE-NEXT: xorl %ecx, %ecx ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB10_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -1185,10 +1185,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 { ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB14_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; ENABLE-NEXT: cmpl %esi, %edi -; ENABLE-NEXT: setl %al +; ENABLE-NEXT: movl %esi, %eax ; ENABLE-NEXT: xorl %esi, %esi -; ENABLE-NEXT: movb %al, %sil +; ENABLE-NEXT: cmpl %eax, %edi +; ENABLE-NEXT: setl %sil ; ENABLE-NEXT: incb %dl ; ENABLE-NEXT: cmpb $45, %dl ; ENABLE-NEXT: jl LBB14_2 @@ -1220,10 +1220,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 { ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB14_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; DISABLE-NEXT: cmpl %esi, %edi -; DISABLE-NEXT: setl %al +; DISABLE-NEXT: movl %esi, %eax ; DISABLE-NEXT: xorl %esi, %esi -; DISABLE-NEXT: movb %al, %sil +; DISABLE-NEXT: cmpl %eax, %edi +; DISABLE-NEXT: setl %sil ; DISABLE-NEXT: incb %dl ; DISABLE-NEXT: cmpb $45, %dl ; DISABLE-NEXT: jl LBB14_2 diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index 2bef66825d8c0..59fbf7183abc6 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -62,12 +62,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB3_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB3_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -78,12 +78,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB3_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB3_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -94,12 +94,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB3_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB3_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -126,13 +126,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB4_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: testw %dx, %dx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: testw %cx, %cx ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -144,13 +144,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB4_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: testw %cx, %cx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %esi, %ecx +; X64-LIN-NEXT: xorl %ecx, %eax +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi +; X64-LIN-NEXT: testw %si, %si ; X64-LIN-NEXT: jne .LBB4_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -163,13 +163,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB4_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: testw %cx, %cx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %edx, %ecx +; X64-WIN-NEXT: xorl %ecx, %eax +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx +; X64-WIN-NEXT: testw %dx, %dx ; X64-WIN-NEXT: jne .LBB4_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -197,12 +197,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB5_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorb %cl, %al -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notb %dl -; X86-NEXT: andb %cl, %dl -; X86-NEXT: addb %dl, %dl -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notb %cl +; X86-NEXT: andb %dl, %cl +; X86-NEXT: addb %cl, %cl ; X86-NEXT: jne .LBB5_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -213,12 +213,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB5_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorb %sil, %al -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notb %cl -; X64-LIN-NEXT: andb %sil, %cl -; X64-LIN-NEXT: addb %cl, %cl -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notb %sil +; X64-LIN-NEXT: andb %cl, %sil +; X64-LIN-NEXT: addb %sil, %sil ; X64-LIN-NEXT: jne .LBB5_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $al killed $al killed $eax @@ -230,12 +230,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB5_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorb %dl, %al -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notb %cl -; X64-WIN-NEXT: andb %dl, %cl -; X64-WIN-NEXT: addb %cl, %cl -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notb %dl +; X64-WIN-NEXT: andb %cl, %dl +; X64-WIN-NEXT: addb %dl, %dl ; X64-WIN-NEXT: jne .LBB5_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -262,12 +262,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB6_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB6_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -278,12 +278,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB6_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: xorl $2147483646, %esi # imm = 0x7FFFFFFE +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB6_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -294,12 +294,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB6_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB6_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll index c12d8135e5eba..082b876b542e5 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll @@ -234,16 +234,17 @@ define void @extrastride(ptr nocapture %main, i32 %main_stride, ptr nocapture %r ; X32-NEXT: .p2align 4 ; X32-NEXT: .LBB2_2: # %for.body ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl (%ebx,%esi), %ebp -; X32-NEXT: addl (%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: movl %ebp, (%edx) -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %ebx, %ebp +; X32-NEXT: movl (%ebx,%esi), %ebx +; X32-NEXT: addl (%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: movl %ebx, (%edx) +; X32-NEXT: leal (%ebp,%esi), %ebx ; X32-NEXT: addl %edi, %ebx ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: decl %eax From 2fc2e1f95156f86dead18f56233965f774fb551f Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Mon, 10 Nov 2025 12:33:28 -0800 Subject: [PATCH 04/30] [LLDB] Fix darwin shell tests under ASAN --- lldb/test/Shell/helper/toolchain.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 728f6347242f1..faa29d23387cc 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -250,6 +250,15 @@ def use_support_substitutions(config): "-L{}".format(config.libcxx_libs_dir), "-lc++", ] + # By default, macOS doesn't allow injecting the ASAN runtime into system processes. + if platform.system() in ["Darwin"] and config.llvm_use_sanitizer: + system_clang = ( + subprocess.check_output(["xcrun", "-find", "clang"]).strip().decode("utf-8") + ) + system_liblto = os.path.join( + os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib" + ) + host_flags += ["-Wl,-lto_library", "-Wl," + system_liblto] host_flags = " ".join(host_flags) config.substitutions.append(("%clang_host", "%clang " + host_flags)) From f8e9723d17187d2b5ba867919ca2f5158fc28659 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 10 Nov 2025 12:49:01 -0800 Subject: [PATCH 05/30] ARM: Enable terminal rule (#165958) --- llvm/lib/Target/ARM/ARMSubtarget.h | 1 + .../Thumb2/LowOverheadLoops/constbound.ll | 18 +- .../Thumb2/LowOverheadLoops/minloop.ll | 24 +- .../varying-outer-2d-reduction.ll | 34 ++- .../Thumb2/LowOverheadLoops/while-loops.ll | 91 ++++---- .../CodeGen/Thumb2/mve-float32regloops.ll | 211 +++++++++--------- .../CodeGen/Thumb2/mve-gather-increment.ll | 24 +- .../Thumb2/mve-gather-scatter-optimisation.ll | 90 ++++---- llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll | 52 ++--- llvm/test/CodeGen/Thumb2/mve-shuffle.ll | 13 +- llvm/test/CodeGen/Thumb2/mve-vld4.ll | 13 +- .../CodeGen/Thumb2/mve-vmaxnma-commute.ll | 24 +- llvm/test/CodeGen/Thumb2/mve-vst4.ll | 14 +- llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll | 2 +- 14 files changed, 302 insertions(+), 309 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 4a0883cc662e7..34baa3108402c 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -377,6 +377,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool isRWPI() const; bool useMachineScheduler() const { return UseMISched; } + bool enableTerminalRule() const override { return true; } bool useMachinePipeliner() const { return UseMIPipeliner; } bool hasMinSize() const { return OptMinSize; } bool isThumb1Only() const { return isThumb() && !hasThumb2(); } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll index 79665af17ef58..9632469261f4d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll @@ -7,22 +7,22 @@ define dso_local i32 @test_500_504(ptr nocapture readonly %x) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #126 -; CHECK-NEXT: adr r2, .LCPI0_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: mov.w r2, #500 -; CHECK-NEXT: vdup.32 q1, r2 -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: adr r1, .LCPI0_0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: mov.w r1, #500 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vdup.32 q1, r1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vqadd.u32 q2, q0, r1 -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: vqadd.u32 q2, q0, r2 +; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: vptt.u32 hi, q1, q2 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vaddvat.u32 r2, q2 +; CHECK-NEXT: vaddvat.u32 r12, q2 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll index ec257bcf123f3..bcedcd40ba112 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll @@ -28,29 +28,29 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK-NEXT: str r6, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: .LBB0_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r11, [r0, #16]! -; CHECK-NEXT: ldrd r5, r7, [r0, #-12] +; CHECK-NEXT: ldrd r5, r6, [r0, #-12] ; CHECK-NEXT: ldr r4, [r0, #-4] ; CHECK-NEXT: cmp r12, r5 ; CHECK-NEXT: csel r5, r5, r12, gt -; CHECK-NEXT: csinc r6, r10, r8, le -; CHECK-NEXT: cmp r5, r7 +; CHECK-NEXT: csinc r7, r10, r8, le +; CHECK-NEXT: cmp r5, r6 ; CHECK-NEXT: it gt -; CHECK-NEXT: addgt.w r6, r8, #2 -; CHECK-NEXT: csel r7, r7, r5, gt -; CHECK-NEXT: cmp r7, r4 +; CHECK-NEXT: addgt.w r7, r8, #2 +; CHECK-NEXT: csel r6, r6, r5, gt +; CHECK-NEXT: cmp r6, r4 ; CHECK-NEXT: it gt -; CHECK-NEXT: addgt.w r6, r8, #3 -; CHECK-NEXT: csel r7, r4, r7, gt +; CHECK-NEXT: addgt.w r7, r8, #3 +; CHECK-NEXT: csel r6, r4, r6, gt ; CHECK-NEXT: add.w r8, r8, #4 -; CHECK-NEXT: cmp r7, r11 -; CHECK-NEXT: csel r10, r8, r6, gt -; CHECK-NEXT: csel r12, r11, r7, gt +; CHECK-NEXT: cmp r6, r11 +; CHECK-NEXT: csel r10, r8, r7, gt +; CHECK-NEXT: csel r12, r11, r6, gt ; CHECK-NEXT: le lr, .LBB0_5 ; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit ; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 1769c5d2fd385..98e082be4cad1 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -21,11 +21,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: it lt ; ENABLED-NEXT: bxlt lr ; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ENABLED-NEXT: mov r11, r0 -; ENABLED-NEXT: ldr r0, [sp, #32] +; ENABLED-NEXT: ldr r0, [sp, #36] ; ENABLED-NEXT: add.w r9, r2, #3 ; ENABLED-NEXT: mov.w r12, #0 +; ENABLED-NEXT: mov.w r8, #1 ; ENABLED-NEXT: mov r10, r11 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 @@ -49,18 +50,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 -; ENABLED-NEXT: movs r7, #1 -; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: sub.w r4, r2, r12 +; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: vmov.i32 q1, #0x0 -; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 +; ENABLED-NEXT: mov r7, r10 +; ENABLED-NEXT: add.w r6, r8, r0, lsr #2 ; ENABLED-NEXT: adds r0, r2, #3 ; ENABLED-NEXT: sub.w r0, r0, r12 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 -; ENABLED-NEXT: mov r7, r10 -; ENABLED-NEXT: dls lr, r0 +; ENABLED-NEXT: add.w lr, r8, r0, lsr #2 ; ENABLED-NEXT: mov r0, r11 ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -83,7 +82,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ENABLED-NEXT: bx lr ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: @@ -92,11 +91,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: it lt ; NOREDUCTIONS-NEXT: bxlt lr ; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: mov r11, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp, #32] +; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] ; NOREDUCTIONS-NEXT: add.w r9, r2, #3 ; NOREDUCTIONS-NEXT: mov.w r12, #0 +; NOREDUCTIONS-NEXT: mov.w r8, #1 ; NOREDUCTIONS-NEXT: mov r10, r11 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 @@ -120,18 +120,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 -; NOREDUCTIONS-NEXT: movs r7, #1 -; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 +; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 -; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: mov r7, r10 +; NOREDUCTIONS-NEXT: add.w r6, r8, r0, lsr #2 ; NOREDUCTIONS-NEXT: adds r0, r2, #3 ; NOREDUCTIONS-NEXT: sub.w r0, r0, r12 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 -; NOREDUCTIONS-NEXT: mov r7, r10 -; NOREDUCTIONS-NEXT: dls lr, r0 +; NOREDUCTIONS-NEXT: add.w lr, r8, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r0, r11 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 @@ -154,7 +152,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: bx lr entry: %conv = sext i16 %N to i32 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll index cbcbf1f392ce8..435acc29f076e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -165,74 +165,73 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" { ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: wls lr, r1, .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: adds r6, r3, #4 -; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: add.w r9, r3, #4 +; CHECK-NEXT: add.w r10, r0, #4 ; CHECK-NEXT: mvn r8, #1 -; CHECK-NEXT: @ implicit-def: $r9 +; CHECK-NEXT: @ implicit-def: $r6 ; CHECK-NEXT: @ implicit-def: $r4 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: ldr.w r1, [r10] ; CHECK-NEXT: asrs r2, r4, #31 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: muls r1, r3, r1 ; CHECK-NEXT: adds r4, r4, r1 ; CHECK-NEXT: adc.w r1, r2, r1, asr #31 ; CHECK-NEXT: adds.w r2, r4, #-2147483648 -; CHECK-NEXT: ldrd r2, r4, [r8] -; CHECK-NEXT: adc r5, r1, #0 -; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: smull r4, r2, r4, r9 -; CHECK-NEXT: asrs r1, r5, #31 +; CHECK-NEXT: ldrd r5, r4, [r8] +; CHECK-NEXT: adc r2, r1, #0 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: subs r4, r5, r4 -; CHECK-NEXT: sbcs r1, r2 -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds.w r10, r4, #-2147483648 -; CHECK-NEXT: adc r1, r1, #0 -; CHECK-NEXT: ldr r4, [r2, #-4] +; CHECK-NEXT: smull r4, r5, r4, r6 +; CHECK-NEXT: asrs r1, r2, #31 +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: subs r4, r2, r4 +; CHECK-NEXT: sbcs r1, r5 +; CHECK-NEXT: adds.w r6, r4, #-2147483648 +; CHECK-NEXT: ldr r4, [r10, #-4] +; CHECK-NEXT: adc r11, r1, #0 +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: add.w r10, r10, #4 ; CHECK-NEXT: muls r4, r3, r4 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: adds.w r12, r4, #-2147483648 ; CHECK-NEXT: asr.w r5, r4, #31 -; CHECK-NEXT: ldr r4, [r6] +; CHECK-NEXT: ldr.w r4, [r9] ; CHECK-NEXT: adc r5, r5, #0 ; CHECK-NEXT: mul r2, r4, r0 -; CHECK-NEXT: adds r0, #4 ; CHECK-NEXT: add.w r2, r2, #-2147483648 ; CHECK-NEXT: asrl r12, r5, r2 -; CHECK-NEXT: smull r2, r5, r4, r12 -; CHECK-NEXT: lsll r2, r5, #30 -; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: asr.w r11, r5, #31 -; CHECK-NEXT: mov r12, r5 -; CHECK-NEXT: lsll r12, r11, r4 -; CHECK-NEXT: mul r2, r2, r9 -; CHECK-NEXT: lsrl r12, r11, #2 -; CHECK-NEXT: adds r2, #2 -; CHECK-NEXT: lsll r12, r11, r2 +; CHECK-NEXT: smull r2, r9, r4, r12 +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: lsll r2, r9, #30 +; CHECK-NEXT: asr.w r5, r9, #31 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: lsll r2, r5, r4 +; CHECK-NEXT: lsrl r2, r5, #2 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: lsll r2, r5, r0 +; CHECK-NEXT: add.w r0, r2, #-2147483648 ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r12, #-2147483648 -; CHECK-NEXT: asrl r10, r1, r5 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: lsrl r10, r1, #2 -; CHECK-NEXT: movs r1, #2 -; CHECK-NEXT: mov r9, r10 -; CHECK-NEXT: str.w r10, [r1] -; CHECK-NEXT: ldr r1, [r8], #-4 -; CHECK-NEXT: mls r5, r1, r4, r5 -; CHECK-NEXT: adds.w r4, r5, #-2147483648 -; CHECK-NEXT: asr.w r1, r5, #31 +; CHECK-NEXT: asrl r6, r11, r0 +; CHECK-NEXT: movs r0, #2 +; CHECK-NEXT: lsrl r6, r11, #2 +; CHECK-NEXT: str r6, [r0] +; CHECK-NEXT: ldr r0, [r8], #-4 +; CHECK-NEXT: mls r0, r0, r4, r1 +; CHECK-NEXT: adds.w r4, r0, #-2147483648 +; CHECK-NEXT: asr.w r1, r0, #31 ; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: lsrl r4, r1, #2 -; CHECK-NEXT: rsbs r1, r4, #0 -; CHECK-NEXT: str r1, [r2] -; CHECK-NEXT: str r1, [r6, #-4] -; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: rsbs r0, r4, #0 +; CHECK-NEXT: str r0, [r2] +; CHECK-NEXT: str r0, [r9, #-4] +; CHECK-NEXT: add.w r9, r9, #4 +; CHECK-NEXT: add.w r0, r12, #4 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: .LBB2_3: @ %while.end ; CHECK-NEXT: add sp, #16 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index f7b4548f127bf..b6657d607ce6d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1573,120 +1573,115 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly % ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: ldrd r7, r9, [r0] -; CHECK-NEXT: and r6, r3, #3 -; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: lsrs r3, r3, #2 -; CHECK-NEXT: @ implicit-def: $r12 -; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: ldm.w r0, {r7, r9, r11} +; CHECK-NEXT: and r0, r3, #3 +; CHECK-NEXT: @ implicit-def: $r5 +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: lsrs r0, r3, #2 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: b .LBB19_3 ; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r12, r10 ; CHECK-NEXT: .LBB19_2: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r0, #128 -; CHECK-NEXT: strd r2, r4, [r9] -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: subs r7, #1 -; CHECK-NEXT: strd r3, r8, [r9, #8] -; CHECK-NEXT: add.w r9, r9, #16 +; CHECK-NEXT: add.w r11, r11, #128 +; CHECK-NEXT: strd r8, r0, [r9] ; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: strd r3, r12, [r9, #8] +; CHECK-NEXT: add.w r9, r9, #16 +; CHECK-NEXT: subs r7, #1 ; CHECK-NEXT: beq.w .LBB19_13 ; CHECK-NEXT: .LBB19_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB19_5 Depth 2 -; CHECK-NEXT: ldrd r5, r11, [r9] +; CHECK-NEXT: ldr.w r10, [r9, #12] ; CHECK-NEXT: mov r6, r2 -; CHECK-NEXT: ldrd r8, r10, [r9, #8] -; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload +; CHECK-NEXT: ldm.w r9, {r3, r4, r12} +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: wls lr, r2, .LBB19_6 +; CHECK-NEXT: wls lr, r0, .LBB19_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: .LBB19_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr r5, [r1, #12] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: ldm.w r1, {r2, r7, r11} -; CHECK-NEXT: vmul.f32 q2, q2, r5 -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vfma.f32 q2, q6, r11 -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: mov r8, r4 +; CHECK-NEXT: ldrd r4, r3, [r1, #8] +; CHECK-NEXT: vldrw.u32 q2, [r11] +; CHECK-NEXT: vldrw.u32 q6, [r11, #16] +; CHECK-NEXT: ldrd r0, r7, [r1] +; CHECK-NEXT: vmul.f32 q2, q2, r3 +; CHECK-NEXT: vldrw.u32 q7, [r11, #32] +; CHECK-NEXT: vfma.f32 q2, q6, r4 +; CHECK-NEXT: vldrw.u32 q4, [r11, #48] ; CHECK-NEXT: vfma.f32 q2, q7, r7 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vfma.f32 q2, q4, r2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vfma.f32 q2, q5, r3 -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] -; CHECK-NEXT: vfma.f32 q2, q3, r4 -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vfma.f32 q2, q1, r8 +; CHECK-NEXT: vldrw.u32 q5, [r11, #64] +; CHECK-NEXT: vfma.f32 q2, q4, r0 +; CHECK-NEXT: vldrw.u32 q3, [r11, #80] +; CHECK-NEXT: vfma.f32 q2, q5, r5 +; CHECK-NEXT: vldrw.u32 q1, [r11, #96] +; CHECK-NEXT: vfma.f32 q2, q3, r8 +; CHECK-NEXT: vldrw.u32 q0, [r11, #112] +; CHECK-NEXT: vfma.f32 q2, q1, r12 ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: vfma.f32 q2, q0, r10 -; CHECK-NEXT: mov r4, r11 -; CHECK-NEXT: vmov r10, r8, d5 +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: vmov r10, r12, d5 ; CHECK-NEXT: vstrb.8 q2, [r6], #16 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: mov r12, r5 ; CHECK-NEXT: le lr, .LBB19_5 ; CHECK-NEXT: .LBB19_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: beq .LBB19_1 ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldrd lr, r4, [r1] -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: ldrd r2, r1, [r1, #8] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: ldrd lr, r0, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r11] +; CHECK-NEXT: ldrd r8, r1, [r1, #8] +; CHECK-NEXT: vldrw.u32 q6, [r11, #16] +; CHECK-NEXT: vldrw.u32 q7, [r11, #32] +; CHECK-NEXT: vldrw.u32 q4, [r11, #48] ; CHECK-NEXT: vmul.f32 q0, q0, r1 -; CHECK-NEXT: vldrw.u32 q5, [r0, #64] -; CHECK-NEXT: vfma.f32 q0, q6, r2 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vfma.f32 q0, q7, r4 -; CHECK-NEXT: vldrw.u32 q2, [r0, #96] +; CHECK-NEXT: vldrw.u32 q5, [r11, #64] +; CHECK-NEXT: vfma.f32 q0, q6, r8 +; CHECK-NEXT: vldrw.u32 q3, [r11, #80] +; CHECK-NEXT: vfma.f32 q0, q7, r0 +; CHECK-NEXT: vldrw.u32 q2, [r11, #96] ; CHECK-NEXT: vfma.f32 q0, q4, lr -; CHECK-NEXT: vldrw.u32 q1, [r0, #112] -; CHECK-NEXT: vfma.f32 q0, q5, r5 -; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: vfma.f32 q0, q3, r11 -; CHECK-NEXT: vfma.f32 q0, q2, r8 +; CHECK-NEXT: vldrw.u32 q1, [r11, #112] +; CHECK-NEXT: vfma.f32 q0, q5, r3 +; CHECK-NEXT: cmp r7, #1 +; CHECK-NEXT: vfma.f32 q0, q3, r4 +; CHECK-NEXT: vfma.f32 q0, q2, r12 ; CHECK-NEXT: vfma.f32 q0, q1, r10 -; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: bne .LBB19_9 ; CHECK-NEXT: @ %bb.8: @ %if.then58 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: str r5, [r6] -; CHECK-NEXT: mov r2, lr -; CHECK-NEXT: mov r4, r12 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: str r4, [r6] +; CHECK-NEXT: mov r8, lr +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: .LBB19_9: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vmov r8, s1 -; CHECK-NEXT: cmp r3, #2 +; CHECK-NEXT: vmov r12, s1 +; CHECK-NEXT: cmp r7, #2 ; CHECK-NEXT: vstr s1, [r6, #4] -; CHECK-NEXT: str r5, [r6] +; CHECK-NEXT: str r4, [r6] ; CHECK-NEXT: bne .LBB19_11 ; CHECK-NEXT: @ %bb.10: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r4, lr -; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: mov r0, lr +; CHECK-NEXT: mov r12, r4 ; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: .LBB19_11: @ %if.else64 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 @@ -1694,7 +1689,7 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly % ; CHECK-NEXT: vstr s2, [r6, #8] ; CHECK-NEXT: .LBB19_12: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: b .LBB19_2 ; CHECK-NEXT: .LBB19_13: @ %do.end ; CHECK-NEXT: add sp, #16 @@ -1901,8 +1896,8 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: ldrd r6, r12, [r0, #4] ; CHECK-NEXT: lsr.w r8, r3, #1 ; CHECK-NEXT: ldrb r0, [r0] @@ -1910,11 +1905,11 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: b .LBB20_3 ; CHECK-NEXT: .LBB20_1: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vstr s12, [r6] +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vstr s4, [r6] ; CHECK-NEXT: .LBB20_2: @ %if.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vstr s14, [r6, #4] +; CHECK-NEXT: vstr s6, [r6, #4] ; CHECK-NEXT: add.w r12, r12, #20 ; CHECK-NEXT: adds r6, #8 ; CHECK-NEXT: subs r0, #1 @@ -1923,41 +1918,39 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: .LBB20_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB20_5 Depth 2 -; CHECK-NEXT: vldrw.u32 q2, [r12] +; CHECK-NEXT: vldrw.u32 q3, [r12] ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vshlc q4, r5, #32 -; CHECK-NEXT: vldrw.u32 q1, [r12, #8] -; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vldrw.u32 q2, [r12, #8] +; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vshlc q5, r5, #32 -; CHECK-NEXT: vldrw.u32 q3, [r6] -; CHECK-NEXT: vmov.f32 s14, s0 +; CHECK-NEXT: vldrw.u32 q1, [r6] +; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: mov r5, r2 -; CHECK-NEXT: vmov.f32 s15, s0 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: wls lr, r8, .LBB20_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: .LBB20_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB20_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrd r7, r4, [r1], #8 -; CHECK-NEXT: vfma.f32 q6, q2, r7 -; CHECK-NEXT: vmov r7, s24 -; CHECK-NEXT: vmov q3, q6 -; CHECK-NEXT: vfma.f32 q3, q1, r7 -; CHECK-NEXT: vstr s24, [r5] -; CHECK-NEXT: vmov.f32 s15, s0 -; CHECK-NEXT: vfma.f32 q3, q4, r4 -; CHECK-NEXT: vmov r4, s13 -; CHECK-NEXT: vstr s13, [r5, #4] -; CHECK-NEXT: vfma.f32 q3, q5, r4 +; CHECK-NEXT: vfma.f32 q1, q3, r7 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vfma.f32 q1, q2, r7 +; CHECK-NEXT: vmov.f32 s7, s0 +; CHECK-NEXT: vfma.f32 q1, q4, r4 +; CHECK-NEXT: vmov r4, s5 +; CHECK-NEXT: vstr s5, [r5, #4] +; CHECK-NEXT: vfma.f32 q1, q5, r4 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vstr s2, [r5] ; CHECK-NEXT: adds r5, #8 -; CHECK-NEXT: vmov.f32 s12, s14 -; CHECK-NEXT: vmov.f32 s13, s15 -; CHECK-NEXT: vmov.f32 s14, s0 -; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: le lr, .LBB20_5 ; CHECK-NEXT: .LBB20_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 @@ -1966,14 +1959,14 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 ; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vfma.f32 q3, q2, r1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vstr s12, [r5] -; CHECK-NEXT: vfma.f32 q3, q1, r1 -; CHECK-NEXT: vstr s13, [r6] +; CHECK-NEXT: vfma.f32 q1, q3, r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vstr s4, [r5] +; CHECK-NEXT: vfma.f32 q1, q2, r1 +; CHECK-NEXT: vstr s5, [r6] ; CHECK-NEXT: b .LBB20_2 ; CHECK-NEXT: .LBB20_8: @ %do.end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.9: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index 0d86f22a321e0..b60ee7c6d406b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -1313,27 +1313,29 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 ; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: .LBB16_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q1, q5, r0 +; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vadd.i32 q6, q5, r0 +; CHECK-NEXT: vmov r7, r3, d13 ; CHECK-NEXT: vadd.i32 q2, q4, r0 -; CHECK-NEXT: vmov r7, r3, d3 -; CHECK-NEXT: vadd.i32 q6, q0, lr ; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vmov q1, q7 +; CHECK-NEXT: vmov r4, r10, d12 +; CHECK-NEXT: vadd.i32 q6, q0, lr ; CHECK-NEXT: subs.w r9, r9, #16 -; CHECK-NEXT: vmov r4, r10, d2 -; CHECK-NEXT: vadd.i32 q1, q7, lr ; CHECK-NEXT: vadd.i32 q4, q4, lr ; CHECK-NEXT: vadd.i32 q5, q5, lr +; CHECK-NEXT: vadd.i32 q7, q7, lr ; CHECK-NEXT: ldrb.w r11, [r3] ; CHECK-NEXT: ldrb r3, [r7] ; CHECK-NEXT: vmov r7, r12, d4 -; CHECK-NEXT: vadd.i32 q2, q7, r0 -; CHECK-NEXT: vadd.i32 q7, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q0, r0 ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: ldrb r4, [r4] @@ -1342,7 +1344,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: ldrb.w r1, [r12] ; CHECK-NEXT: vmov.8 q0[0], r7 ; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: vmov r1, r7, d15 +; CHECK-NEXT: vmov r1, r7, d3 ; CHECK-NEXT: vmov.8 q0[2], r5 ; CHECK-NEXT: vmov.8 q0[3], r6 ; CHECK-NEXT: vmov.8 q0[4], r4 @@ -1357,8 +1359,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: ldrb r3, [r5] ; CHECK-NEXT: ldrb.w r12, [r7] ; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: vmov r4, r7, d14 -; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vmov r4, r7, d2 ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: vmov.8 q0[8], r4 @@ -1370,7 +1371,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: vmov.8 q0[14], r3 ; CHECK-NEXT: vmov.8 q0[15], r12 ; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vmov q0, q6 ; CHECK-NEXT: bne .LBB16_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index eedca2cd4a5d3..c0b2da7eff41b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -236,11 +236,11 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(ptr noalias nocapture r ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2] -; CHECK-NEXT: vadd.i32 q3, q1, q0 +; CHECK-NEXT: vldrw.u32 q3, [r0, q1, uxtw #2] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] -; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vstrw.32 q3, [r0, q2, uxtw #2] ; CHECK-NEXT: bne .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr @@ -330,20 +330,20 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(ptr noalias nocapture readonly %da ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r4, .LCPI7_0 ; CHECK-NEXT: mov.w r12, #9 -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vdup.32 q1, r0 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vadd.i32 q2, q1, r4 -; CHECK-NEXT: vmla.i32 q3, q1, lr -; CHECK-NEXT: vmul.i32 q1, q1, r12 -; CHECK-NEXT: vldrw.u32 q4, [q3, #24] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmla.i32 q3, q2, lr ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vstrw.32 q1, [r3] -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vldrw.u32 q4, [q3, #24] +; CHECK-NEXT: vmul.i32 q2, q2, r12 +; CHECK-NEXT: vadd.i32 q0, q0, r4 +; CHECK-NEXT: vstrw.32 q2, [r3] ; CHECK-NEXT: vstrb.8 q4, [r1], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end @@ -390,22 +390,22 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(ptr noalias nocapture readonly %da ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: adr r4, .LCPI8_0 ; CHECK-NEXT: movs r5, #18 -; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: mov.w r12, #9 ; CHECK-NEXT: mov.w lr, #12 ; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vdup.32 q1, r5 +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vdup.32 q2, r5 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vadd.i32 q3, q2, r4 -; CHECK-NEXT: vmla.i32 q4, q2, lr +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmla.i32 q4, q3, lr ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vldrw.u32 q5, [q4, #24] -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmla.i32 q4, q2, r12 -; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmla.i32 q4, q3, r12 +; CHECK-NEXT: vadd.i32 q0, q0, r4 ; CHECK-NEXT: vstrb.8 q5, [r1], #16 ; CHECK-NEXT: vstrw.32 q4, [r3] ; CHECK-NEXT: bne .LBB8_1 @@ -487,21 +487,21 @@ define dso_local void @arm_mat_mult_q31(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov q7, q2 +; CHECK-NEXT: vmov q1, q2 ; CHECK-NEXT: dls lr, r10 ; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: vmlas.i32 q7, q0, r7 -; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vmlas.i32 q1, q0, r7 +; CHECK-NEXT: vmov q7, q4 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q0, q7, q3 -; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2] -; CHECK-NEXT: vldrw.u32 q7, [q6, #32]! -; CHECK-NEXT: vmul.i32 q1, q1, q7 -; CHECK-NEXT: vmov q7, q0 -; CHECK-NEXT: vadd.i32 q5, q1, q5 +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q3 +; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2] +; CHECK-NEXT: vldrw.u32 q6, [q7, #32]! +; CHECK-NEXT: vmul.i32 q0, q0, q6 +; CHECK-NEXT: vadd.i32 q5, q0, q5 ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2 @@ -702,12 +702,12 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q6, q5, q3 -; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] -; CHECK-NEXT: vldrh.s32 q5, [r3], #8 -; CHECK-NEXT: vmul.i32 q5, q7, q5 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov q5, q6 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vadd.i32 q5, q5, q3 +; CHECK-NEXT: vldrh.s32 q7, [r1, q6, uxtw #1] +; CHECK-NEXT: vldrh.s32 q6, [r3], #8 +; CHECK-NEXT: vmul.i32 q6, q7, q6 +; CHECK-NEXT: vadd.i32 q4, q6, q4 ; CHECK-NEXT: le lr, .LBB10_11 ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 @@ -922,15 +922,15 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(ptr nocapture readonly ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 ; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=5 -; CHECK-NEXT: vldrb.s32 q2, [r0, q5] -; CHECK-NEXT: vadd.i32 q7, q5, q0 -; CHECK-NEXT: vldrb.s32 q5, [r1, q4] -; CHECK-NEXT: vadd.i32 q6, q4, q0 -; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vmov q7, q5 +; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vldrb.s32 q2, [r0, q7] +; CHECK-NEXT: vldrb.s32 q7, [r1, q6] ; CHECK-NEXT: subs r5, #4 -; CHECK-NEXT: vmlava.u32 r12, q2, q5 -; CHECK-NEXT: vmov q5, q7 -; CHECK-NEXT: vmov q4, q6 +; CHECK-NEXT: vadd.i32 q4, q4, q0 +; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vadd.i32 q5, q5, q0 +; CHECK-NEXT: vmlava.u32 r12, q2, q7 ; CHECK-NEXT: bne .LBB11_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4 diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll index 43ed5eefbf4c7..d6c5cde30ed73 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll @@ -18,50 +18,50 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: csel r7, r6, r5, hs ; CHECK-NEXT: add.w lr, r7, #1 ; CHECK-NEXT: mov r4, r5 -; CHECK-NEXT: vldrh.u16 q0, [r0], #32 +; CHECK-NEXT: vldrh.u16 q1, [r0], #32 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: vldrh.u16 q2, [r1], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q2 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q2 +; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 ; CHECK-NEXT: vldrh.u16 q1, [r1], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 ; CHECK-NEXT: sub.w lr, lr, #1 ; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 +; CHECK-NEXT: vldrh.u16 q3, [r0], #32 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 -; CHECK-NEXT: vldrh.u16 q0, [r1], #32 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 +; CHECK-NEXT: vldrh.u16 q2, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 +; CHECK-NEXT: vldrh.u16 q3, [r0], #32 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2 +; CHECK-NEXT: vldrh.u16 q1, [r1], #32 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2 ; CHECK-NEXT: movs r6, #14 ; CHECK-NEXT: and.w r2, r6, r2, lsl #1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r1, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1 +; CHECK-NEXT: vldrh.u16 q0, [r0, #-16] +; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1 +; CHECK-NEXT: vldrh.u16 q1, [r1, #-16] +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrht.u16 q1, [r0] +; CHECK-NEXT: vldrht.u16 q2, [r0] ; CHECK-NEXT: cmp r2, #9 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrht.u16 q0, [r1] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0 -; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0 +; CHECK-NEXT: vmlsldavat.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q2, q0 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.4: @ %do.body.1 ; CHECK-NEXT: subs r2, #8 diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index 94d5490cead2f..6f2a0b2debc47 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -439,17 +439,18 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) { ; CHECK-NEXT: vmovx.f16 s1, s14 ; CHECK-NEXT: vmovx.f16 s20, s0 ; CHECK-NEXT: vins.f16 s23, s1 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vins.f16 s20, s1 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vmovx.f16 s21, s4 -; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vins.f16 s20, s2 +; CHECK-NEXT: vmovx.f16 s2, s6 ; CHECK-NEXT: vins.f16 s12, s14 ; CHECK-NEXT: vins.f16 s8, s10 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vins.f16 s21, s1 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vins.f16 s21, s2 +; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vmov.f32 s3, s12 ; CHECK-NEXT: vadd.i16 q0, q0, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll index ab41069bfa258..ecb169898f9f0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -391,17 +391,18 @@ define void @vld4_v8i16_align1(ptr %src, ptr %dst) { ; CHECK-NEXT: vmovx.f16 s1, s2 ; CHECK-NEXT: vmovx.f16 s20, s8 ; CHECK-NEXT: vins.f16 s23, s1 -; CHECK-NEXT: vmovx.f16 s1, s10 -; CHECK-NEXT: vins.f16 s20, s1 +; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmovx.f16 s10, s10 ; CHECK-NEXT: vmovx.f16 s21, s12 -; CHECK-NEXT: vmovx.f16 s1, s14 +; CHECK-NEXT: vins.f16 s20, s10 +; CHECK-NEXT: vmovx.f16 s10, s14 ; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vins.f16 s12, s14 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vins.f16 s21, s1 -; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vins.f16 s21, s10 ; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vins.f16 s8, s1 +; CHECK-NEXT: vmov.f32 s9, s12 ; CHECK-NEXT: vmov.f32 s11, s0 ; CHECK-NEXT: vadd.i16 q0, q2, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll index 04be18e3dd873..6656d44eec81e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll @@ -344,14 +344,14 @@ define void @loop_absmax32_pred_c(ptr %0, i32 %1, ptr nocapture %2) { ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB19_1: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmaxnma.f32 q1, q0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vmaxnma.f32 q0, q1 ; CHECK-NEXT: letp lr, .LBB19_1 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: vldr s0, .LCPI19_0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmaxnmav.f32 r0, q1 +; CHECK-NEXT: vldr s4, .LCPI19_0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmaxnmav.f32 r0, q0 ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vstr s0, [r2] ; CHECK-NEXT: pop {r7, pc} @@ -538,14 +538,14 @@ define void @loop_absmax16_pred_c(ptr %0, i32 %1, ptr nocapture %2) { ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.16 lr, r1 ; CHECK-NEXT: .LBB23_1: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0], #8 -; CHECK-NEXT: vmaxnma.f16 q1, q0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0], #8 +; CHECK-NEXT: vmaxnma.f16 q0, q1 ; CHECK-NEXT: letp lr, .LBB23_1 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: vldr.16 s0, .LCPI23_0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmaxnmav.f16 r0, q1 +; CHECK-NEXT: vldr.16 s4, .LCPI23_0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmaxnmav.f16 r0, q0 ; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index 26ab555c2c593..fb5f543fd0d3a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -1055,18 +1055,18 @@ define void @vst4_v4f16(ptr %src, ptr %dst) { ; CHECK-NEXT: vins.f16 s12, s2 ; CHECK-NEXT: vmovx.f16 s2, s3 ; CHECK-NEXT: vins.f16 s11, s2 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s6, s7 +; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vmovx.f16 s2, s7 ; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vins.f16 s10, s6 +; CHECK-NEXT: vins.f16 s10, s2 ; CHECK-NEXT: vmov.f32 s9, s1 ; CHECK-NEXT: vmov.f32 s5, s0 ; CHECK-NEXT: vstrh.16 q2, [r1, #16] -; CHECK-NEXT: vmov.f32 s6, s2 ; CHECK-NEXT: vmov.f32 s7, s12 ; CHECK-NEXT: vstrh.16 q1, [r1] ; CHECK-NEXT: pop {r4, r5, r6, pc} diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll index e6fcf56af6e8d..2929a04cc0637 100644 --- a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll +++ b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll @@ -63,8 +63,8 @@ define hidden i32 @f(i32 %n) local_unnamed_addr #0 { ; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: sub.w r3, r4, #16 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2 -; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_5: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r5, [r3, #16]! From 2aa629da6a611be3f8aec9e1dd6d0a7f5f8f6a23 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 10 Nov 2025 12:49:21 -0800 Subject: [PATCH 06/30] AArch64: Enable terminal rule (#165959) --- llvm/lib/Target/AArch64/AArch64Subtarget.h | 2 +- .../CodeGen/AArch64/build-vector-two-dup.ll | 10 +- .../AArch64/machine-licm-sink-instr.ll | 39 +++----- .../AArch64/machine-sink-kill-flags.ll | 5 +- ...ate-sm-changing-call-disable-coalescing.ll | 85 +++++++---------- .../sme-streaming-compatible-interface.ll | 5 +- .../sve-extract-fixed-from-scalable-vector.ll | 12 +-- .../AArch64/sve-extract-fixed-vector.ll | 39 ++++---- .../AArch64/sve-fixed-length-reshuffle.ll | 12 +-- .../AArch64/sve-fixed-length-shuffles.ll | 72 +++++++-------- .../CodeGen/AArch64/sve-ptest-removal-sink.ll | 8 +- llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 92 +++++++++---------- 12 files changed, 177 insertions(+), 204 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 8974965c41fe3..ab4004e30f629 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -157,7 +157,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { return usePostRAScheduler(); } bool enableSubRegLiveness() const override { return EnableSubregLiveness; } - + bool enableTerminalRule() const override { return true; } bool enableMachinePipeliner() const override; bool useDFAforSMS() const override { return false; } diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll index dbbfbea9176f6..f725c19081deb 100644 --- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll @@ -188,11 +188,11 @@ entry: define <8 x i8> @test11(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test11: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ld1r { v1.8b }, [x0] -; CHECK-NEXT: ld1r { v2.8b }, [x1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.h[2], v2.h[0] -; CHECK-NEXT: mov v0.h[3], v1.h[0] +; CHECK-NEXT: ld1r { v0.8b }, [x0] +; CHECK-NEXT: ld1r { v1.8b }, [x1] +; CHECK-NEXT: fmov d2, d0 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: mov v0.h[3], v2.h[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll index 3230c9e946da7..b3a7ec961b736 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll @@ -20,20 +20,17 @@ define i32 @sink_load_and_copy(i32 %n) { ; CHECK-NEXT: b.lt .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: adrp x8, A -; CHECK-NEXT: mov w20, w19 -; CHECK-NEXT: ldr w21, [x8, :lo12:A] +; CHECK-NEXT: mov w21, w19 +; CHECK-NEXT: ldr w20, [x8, :lo12:A] ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov w0, w21 +; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w20, w20, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB0_2 -; CHECK-NEXT: b .LBB0_4 -; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: mov w20, w19 -; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup -; CHECK-NEXT: mov w0, w20 +; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -82,15 +79,12 @@ define i32 @cant_sink_successive_call(i32 %n) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w21, w21, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB1_2 -; CHECK-NEXT: b .LBB1_4 -; CHECK-NEXT: .LBB1_3: -; CHECK-NEXT: mov w21, w19 -; CHECK-NEXT: .LBB1_4: // %for.cond.cleanup +; CHECK-NEXT: .LBB1_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -139,15 +133,12 @@ define i32 @cant_sink_successive_store(ptr nocapture readnone %store, i32 %n) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w21, w21, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB2_2 -; CHECK-NEXT: b .LBB2_4 -; CHECK-NEXT: .LBB2_3: -; CHECK-NEXT: mov w21, w19 -; CHECK-NEXT: .LBB2_4: // %for.cond.cleanup +; CHECK-NEXT: .LBB2_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll index e7e109170d6a1..338084295fc7f 100644 --- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll +++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll @@ -16,13 +16,12 @@ define i32 @test(ptr %ptr) { ; CHECK-NEXT: mov w9, wzr ; CHECK-NEXT: LBB0_1: ; %.thread ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lsr w11, w9, #1 ; CHECK-NEXT: sub w10, w9, #1 -; CHECK-NEXT: mov w9, w11 +; CHECK-NEXT: lsr w9, w9, #1 ; CHECK-NEXT: tbnz w10, #0, LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb343 ; CHECK-NEXT: and w9, w10, #0x1 -; CHECK-NEXT: mov w0, #-1 +; CHECK-NEXT: mov w0, #-1 ; =0xffffffff ; CHECK-NEXT: str w9, [x8] ; CHECK-NEXT: ret bb: diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index b947c943ba448..72f6646930624 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -151,12 +151,11 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_f16 @@ -190,12 +189,11 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f32 @@ -229,12 +227,11 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_f64 @@ -273,12 +270,11 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 @@ -313,12 +309,11 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 @@ -353,12 +348,11 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 @@ -393,12 +387,11 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 @@ -433,12 +426,11 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 @@ -513,12 +505,11 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 @@ -557,12 +548,11 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 @@ -596,12 +586,11 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 @@ -635,12 +624,11 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 @@ -674,12 +662,11 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 @@ -713,12 +700,11 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 @@ -752,12 +738,11 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8bf16 @@ -791,12 +776,11 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 @@ -830,12 +814,11 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index f2163ad15bafc..df88f37195ed6 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -129,12 +129,11 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: mrs x19, SVCR -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll index 6c6a691760af3..52a77cb396909 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll @@ -147,15 +147,15 @@ define <2 x float> @extract_v2f32_nxv16f32_2( %arg) { define <4 x i1> @extract_v4i1_nxv32i1_0( %arg) { ; CHECK-LABEL: extract_v4i1_nxv32i1_0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.b[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.b[2] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.b[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: umov w8, v1.b[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: umov w8, v1.b[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1( %arg, i64 0) ret <4 x i1> %ext diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index e10313773c73e..72994100b2970 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -248,15 +248,15 @@ define <2 x i1> @extract_v2i1_nxv2i1( %inmask) { define <4 x i1> @extract_v4i1_nxv4i1( %inmask) { ; CHECK-LABEL: extract_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <4 x i1> @llvm.vector.extract.v4i1.nxv4i1( %inmask, i64 0) ret <4 x i1> %mask @@ -265,23 +265,23 @@ define <4 x i1> @extract_v4i1_nxv4i1( %inmask) { define <8 x i1> @extract_v8i1_nxv8i1( %inmask) { ; CHECK-LABEL: extract_v8i1_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v0.b[2], w8 ; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v0.b[2], w9 -; CHECK-NEXT: umov w9, v1.h[4] ; CHECK-NEXT: mov v0.b[3], w8 +; CHECK-NEXT: umov w8, v1.h[4] +; CHECK-NEXT: mov v0.b[4], w8 ; CHECK-NEXT: umov w8, v1.h[5] -; CHECK-NEXT: mov v0.b[4], w9 -; CHECK-NEXT: umov w9, v1.h[6] ; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: umov w8, v1.h[6] +; CHECK-NEXT: mov v0.b[6], w8 ; CHECK-NEXT: umov w8, v1.h[7] -; CHECK-NEXT: mov v0.b[6], w9 ; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( %inmask, i64 0) ret <8 x i1> %mask @@ -292,9 +292,9 @@ define <8 x i1> @extract_v8i1_nxv8i1( %inmask) { define <16 x i1> @extract_v16i1_nxv16i1( %inmask) { ; CHECK-LABEL: extract_v16i1_nxv16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.b[1], v1.b[1] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v0.b[1], v0.b[1] ; CHECK-NEXT: mov v0.b[2], v1.b[2] ; CHECK-NEXT: mov v0.b[3], v1.b[3] ; CHECK-NEXT: mov v0.b[4], v1.b[4] @@ -309,6 +309,7 @@ define <16 x i1> @extract_v16i1_nxv16i1( %inmask) { ; CHECK-NEXT: mov v0.b[13], v1.b[13] ; CHECK-NEXT: mov v0.b[14], v1.b[14] ; CHECK-NEXT: mov v0.b[15], v1.b[15] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %mask = call <16 x i1> @llvm.vector.extract.v16i1.nxv16i1( %inmask, i64 0) ret <16 x i1> %mask diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll index 41e4a38fad90b..8e807cda7166d 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll @@ -8,15 +8,15 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i1> @reshuffle_v4i1_nxv4i1( %a) #0 { ; CHECK-LABEL: reshuffle_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %el0 = extractelement %a, i32 0 %el1 = extractelement %a, i32 1 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index ba4a3a2042305..bd8f432579a08 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -28,53 +28,53 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK: // %bb.0: ; CHECK-NEXT: tbnz w1, #0, .LBB1_2 ; CHECK-NEXT: // %bb.1: // %vector.body +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: umov w8, v0.b[8] -; CHECK-NEXT: mov v1.b[1], v0.b[1] -; CHECK-NEXT: movprfx z3, z0 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #16 +; CHECK-NEXT: umov w8, v2.b[8] +; CHECK-NEXT: mov v0.b[1], v2.b[1] +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #16 ; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: mov v1.b[2], v0.b[2] -; CHECK-NEXT: mov v2.b[1], v0.b[9] -; CHECK-NEXT: mov v1.b[3], v0.b[3] -; CHECK-NEXT: mov v2.b[2], v0.b[10] -; CHECK-NEXT: mov v1.b[4], v0.b[4] -; CHECK-NEXT: mov v2.b[3], v0.b[11] -; CHECK-NEXT: mov v1.b[5], v0.b[5] -; CHECK-NEXT: mov v2.b[4], v0.b[12] -; CHECK-NEXT: mov v1.b[6], v0.b[6] -; CHECK-NEXT: mov v2.b[5], v0.b[13] -; CHECK-NEXT: mov v1.b[7], v0.b[7] -; CHECK-NEXT: mov v2.b[6], v0.b[14] -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: mov v2.b[7], v0.b[15] -; CHECK-NEXT: uunpklo z0.h, z3.b +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov v0.b[2], v2.b[2] +; CHECK-NEXT: mov v1.b[1], v2.b[9] +; CHECK-NEXT: mov v0.b[3], v2.b[3] +; CHECK-NEXT: mov v1.b[2], v2.b[10] +; CHECK-NEXT: mov v0.b[4], v2.b[4] +; CHECK-NEXT: mov v1.b[3], v2.b[11] +; CHECK-NEXT: mov v0.b[5], v2.b[5] +; CHECK-NEXT: mov v1.b[4], v2.b[12] +; CHECK-NEXT: mov v0.b[6], v2.b[6] +; CHECK-NEXT: mov v1.b[5], v2.b[13] +; CHECK-NEXT: mov v0.b[7], v2.b[7] +; CHECK-NEXT: mov v1.b[6], v2.b[14] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: mov v1.b[7], v2.b[15] +; CHECK-NEXT: uunpklo z2.h, z3.b ; CHECK-NEXT: uunpklo z3.h, z4.b -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: lsl z2.s, z2.s, #31 ; CHECK-NEXT: lsl z3.s, z3.s, #31 -; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 +; CHECK-NEXT: asr z2.s, z2.s, #31 ; CHECK-NEXT: asr z3.s, z3.s, #31 -; CHECK-NEXT: lsl z2.s, z2.s, #31 -; CHECK-NEXT: cmpne p3.s, p0/z, z1.s, #0 -; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: cmpne p3.s, p0/z, z0.s, #0 +; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, #0 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 -; CHECK-NEXT: asr z2.s, z2.s, #31 -; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 -; CHECK-NEXT: st1w { z0.s }, p1, [x0, #2, mul vl] -; CHECK-NEXT: st1w { z0.s }, p2, [x0, #3, mul vl] -; CHECK-NEXT: st1w { z0.s }, p3, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x0, #1, mul vl] +; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; CHECK-NEXT: st1w { z2.s }, p1, [x0, #2, mul vl] +; CHECK-NEXT: st1w { z2.s }, p2, [x0, #3, mul vl] +; CHECK-NEXT: st1w { z2.s }, p3, [x0] +; CHECK-NEXT: st1w { z2.s }, p0, [x0, #1, mul vl] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll index 124f81e7864d1..39fe92aae0619 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll @@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) { ; CHECK-NEXT: whilelt p0.s, wzr, w0 ; CHECK-NEXT: b.pl .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: cntw x9 +; CHECK-NEXT: mov w9, wzr +; CHECK-NEXT: cntw x8 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: whilelt p0.s, w8, w0 -; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: whilelt p0.s, w9, w0 +; CHECK-NEXT: add w9, w9, w8 ; CHECK-NEXT: b.mi .LBB0_2 ; CHECK-NEXT: .LBB0_3: // %exit ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 74a717f1635a3..935189dec48ac 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2835,11 +2835,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB24_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -2847,11 +2847,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB24_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -2950,26 +2950,26 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB25_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x0] -; CHECK-BE-NEXT: add x9, x1, #48 -; CHECK-BE-NEXT: add x8, x1, #32 -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] +; CHECK-BE-NEXT: add x10, x1, #48 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x1] +; CHECK-BE-NEXT: add x9, x1, #32 +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] ; CHECK-BE-NEXT: add x1, x1, #16 -; CHECK-BE-NEXT: ld1 { v20.4s }, [x8] +; CHECK-BE-NEXT: ld1 { v20.4s }, [x9] ; CHECK-BE-NEXT: ld1 { v22.4s }, [x1] -; CHECK-BE-NEXT: add x8, x0, #96 +; CHECK-BE-NEXT: add x9, x0, #96 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v2.16b ; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b ; CHECK-BE-NEXT: ext v24.16b, v18.16b, v18.16b, #8 -; CHECK-BE-NEXT: add x9, x0, #32 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ext v25.16b, v20.16b, v20.16b, #8 -; CHECK-BE-NEXT: add x10, x0, #16 +; CHECK-BE-NEXT: add x10, x0, #32 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8 -; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v5.8b, v5.8b +; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v21.8b, v7.8b ; CHECK-BE-NEXT: rev32 v23.8b, v4.8b ; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8 @@ -2986,22 +2986,22 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s ; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: st1 { v5.2d }, [x9] ; CHECK-BE-NEXT: umull v5.2d, v6.2s, v20.2s ; CHECK-BE-NEXT: umull v6.2d, v7.2s, v21.2s -; CHECK-BE-NEXT: add x8, x0, #112 +; CHECK-BE-NEXT: add x9, x0, #112 ; CHECK-BE-NEXT: umull v4.2d, v4.2s, v16.2s -; CHECK-BE-NEXT: st1 { v18.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #80 +; CHECK-BE-NEXT: st1 { v18.2d }, [x10] +; CHECK-BE-NEXT: add x10, x0, #80 ; CHECK-BE-NEXT: st1 { v22.2d }, [x0] -; CHECK-BE-NEXT: st1 { v17.2d }, [x8] -; CHECK-BE-NEXT: add x8, x0, #64 -; CHECK-BE-NEXT: st1 { v19.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: mov x0, x8 -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: add x0, x0, #64 +; CHECK-BE-NEXT: st1 { v17.2d }, [x9] +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: st1 { v19.2d }, [x10] +; CHECK-BE-NEXT: st1 { v5.2d }, [x0] ; CHECK-BE-NEXT: st1 { v6.2d }, [x9] -; CHECK-BE-NEXT: st1 { v4.2d }, [x10] +; CHECK-BE-NEXT: st1 { v4.2d }, [x8] ; CHECK-BE-NEXT: b.ne .LBB25_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3093,13 +3093,14 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB26_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #32 +; CHECK-BE-NEXT: mov x8, x0 +; CHECK-BE-NEXT: add x9, x0, #32 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x0] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #16 -; CHECK-BE-NEXT: ld1 { v17.4s }, [x8] -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] -; CHECK-BE-NEXT: ld1 { v19.4s }, [x10] +; CHECK-BE-NEXT: add x10, x0, #48 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: ld1 { v17.4s }, [x9] +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] +; CHECK-BE-NEXT: ld1 { v19.4s }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v3.16b @@ -3113,11 +3114,10 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: mul v6.4s, v17.4s, v6.4s ; CHECK-BE-NEXT: mul v7.4s, v18.4s, v7.4s ; CHECK-BE-NEXT: mul v4.4s, v19.4s, v4.4s -; CHECK-BE-NEXT: st1 { v5.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x10 -; CHECK-BE-NEXT: st1 { v6.4s }, [x8] -; CHECK-BE-NEXT: st1 { v7.4s }, [x9] -; CHECK-BE-NEXT: st1 { v4.4s }, [x10] +; CHECK-BE-NEXT: st1 { v5.4s }, [x8] +; CHECK-BE-NEXT: st1 { v6.4s }, [x9] +; CHECK-BE-NEXT: st1 { v7.4s }, [x10] +; CHECK-BE-NEXT: st1 { v4.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB26_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3246,11 +3246,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB28_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -3258,11 +3258,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: smull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: smull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: smull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB28_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr From 067f1556d0ec5a38fdc8b86596d0e3544f4e6239 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Mon, 10 Nov 2025 15:50:51 -0500 Subject: [PATCH 07/30] [AMDGPU] remove clamp and omod for trans bf16 insts (#165819) trans bf16 insts do not support clamp and omod --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 39 ++++++-- llvm/lib/Target/AMDGPU/VOPInstructions.td | 8 +- llvm/test/CodeGen/AMDGPU/bf16-math.ll | 5 +- llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s | 80 ++++++++++++++++ .../gfx1250_asm_vop3_from_vop1-fake16.s | 72 -------------- .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 72 -------------- .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 96 ------------------- .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 96 ------------------- .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 96 ------------------- .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 96 ------------------- .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 96 ------------------- .../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 96 ------------------- .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 96 ------------------- 13 files changed, 121 insertions(+), 827 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 54f57e02ed47e..85adcab55b742 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -513,6 +513,13 @@ defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16", defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16", VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16, fp_to_sint>; + +let HasClamp = 0, HasOMod = 0 in { +def V_TRANS_BF16_Profile : VOPProfile <[bf16, bf16, untyped, untyped]>; +def V_TRANS_BF16_t16_Profile : VOPProfile_True16 ; +def V_TRANS_BF16_fake16_Profile : VOPProfile_Fake16 ; +} + let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>; @@ -527,14 +534,30 @@ defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; } let SubtargetPredicate = HasBF16TransInsts in { -defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; -defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; -defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; -defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; -defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; -defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; -defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; -defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; +defm V_TANH_BF16 : VOP1Inst_t16_with_profiles<"v_tanh_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + int_amdgcn_tanh>; +defm V_RCP_BF16 : VOP1Inst_t16_with_profiles<"v_rcp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrcp>; +defm V_SQRT_BF16 : VOP1Inst_t16_with_profiles<"v_sqrt_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16_with_profiles<"v_rsq_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16_with_profiles<"v_log_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16_with_profiles<"v_exp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16_with_profiles<"v_sin_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16_with_profiles<"v_cos_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUcos>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 8325c628d68d6..6df91a1d438b0 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1357,8 +1357,12 @@ class VOPBinOpClampPat : class getVOP3ModPat { dag src0 = !if(P.HasOMod, - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i32:$omod)), + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers))); list ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp.ret (P.Src0VT src0), diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 30a78648c186a..39618b05e6c71 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -368,7 +368,10 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) { define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { ; GCN-LABEL: test_clamp_bf16_folding: ; GCN: ; %bb.0: -; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp +; GCN-NEXT: v_exp_bf16_e32 v0, v0 +; GCN-NEXT: v_nop +; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp ; GCN-NEXT: ; return to shader part epilog %exp = call bfloat @llvm.exp2.bf16(bfloat %src) %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0) diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s index c393d3e819880..3f6d8feb45df0 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s @@ -34,3 +34,83 @@ v_cvt_f32_bf16 v5, v1 div:2 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. // GFX1250-ERR-NEXT:{{^}}v_cvt_f32_bf16 v5, v1 div:2 // GFX1250-ERR-NEXT:{{^}} ^ + +v_cos_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_cos_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_exp_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_exp_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_log_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_log_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rcp_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rcp_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rsq_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rsq_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sin_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sin_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sqrt_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sqrt_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_tanh_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_tanh_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 0931523bbf40c..37ad6eb249da4 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -3781,15 +3781,6 @@ v_tanh_bf16_e64 v5, null v_tanh_bf16_e64 v5, -1 // GFX1250: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - -v_tanh_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - -v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_prng_b32_e64 v5, v1 // GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00] @@ -3862,15 +3853,6 @@ v_rcp_bf16_e64 v5, null v_rcp_bf16_e64 v5, -1 // GFX1250: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - -v_rcp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - -v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sqrt_bf16_e64 v5, v1 // GFX1250: v_sqrt_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00] @@ -3907,15 +3889,6 @@ v_sqrt_bf16_e64 v5, null v_sqrt_bf16_e64 v5, -1 // GFX1250: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - -v_sqrt_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - -v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rsq_bf16_e64 v5, v1 // GFX1250: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] @@ -3952,15 +3925,6 @@ v_rsq_bf16_e64 v5, null v_rsq_bf16_e64 v5, -1 // GFX1250: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - -v_rsq_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - -v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_log_bf16_e64 v5, v1 // GFX1250: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] @@ -3997,15 +3961,6 @@ v_log_bf16_e64 v5, null v_log_bf16_e64 v5, -1 // GFX1250: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -v_log_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - -v_log_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - -v_log_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_exp_bf16_e64 v5, v1 // GFX1250: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] @@ -4042,15 +3997,6 @@ v_exp_bf16_e64 v5, null v_exp_bf16_e64 v5, -1 // GFX1250: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -v_exp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - -v_exp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - -v_exp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sin_bf16_e64 v5, v1 // GFX1250: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] @@ -4087,15 +4033,6 @@ v_sin_bf16_e64 v5, null v_sin_bf16_e64 v5, -1 // GFX1250: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -v_sin_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] - -v_sin_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] - -v_sin_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cos_bf16_e64 v5, v1 // GFX1250: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] @@ -4132,15 +4069,6 @@ v_cos_bf16_e64 v5, null v_cos_bf16_e64 v5, -1 // GFX1250: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -v_cos_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] - -v_cos_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] - -v_cos_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index 5ac9eb47381d6..52f9ba3a99483 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -3952,15 +3952,6 @@ v_tanh_bf16_e64 v5.l, null v_tanh_bf16_e64 v5.l, -1 // GFX1250: v_tanh_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - -v_tanh_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - -v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_tanh_bf16 v5.l, v128.h // GFX1250: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00] @@ -4036,15 +4027,6 @@ v_rcp_bf16_e64 v5.l, null v_rcp_bf16_e64 v5.l, -1 // GFX1250: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - -v_rcp_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - -v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rcp_bf16 v5.h, v128.h // GFX1250: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00] @@ -4084,15 +4066,6 @@ v_sqrt_bf16_e64 v5.l, null v_sqrt_bf16_e64 v5.l, -1 // GFX1250: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - -v_sqrt_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - -v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sqrt_bf16 v5.h, v128.h // GFX1250: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00] @@ -4132,15 +4105,6 @@ v_rsq_bf16_e64 v5.l, null v_rsq_bf16_e64 v5.l, -1 // GFX1250: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - -v_rsq_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - -v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rsq_bf16 v5.h, v128.h // GFX1250: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] @@ -4180,15 +4144,6 @@ v_log_bf16_e64 v5.l, null v_log_bf16_e64 v5.l, -1 // GFX1250: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -v_log_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - -v_log_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - -v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_log_bf16 v5.h, v128.h // GFX1250: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] @@ -4228,15 +4183,6 @@ v_exp_bf16_e64 v5.l, null v_exp_bf16_e64 v5.l, -1 // GFX1250: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -v_exp_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - -v_exp_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - -v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_exp_bf16 v5.h, v128.h // GFX1250: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] @@ -4276,15 +4222,6 @@ v_sin_bf16_e64 v5.l, null v_sin_bf16_e64 v5.l, -1 // GFX1250: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -v_sin_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] - -v_sin_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] - -v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sin_bf16 v5.h, v128.h // GFX1250: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] @@ -4324,15 +4261,6 @@ v_cos_bf16_e64 v5.l, null v_cos_bf16_e64 v5.l, -1 // GFX1250: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -v_cos_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] - -v_cos_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] - -v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cos_bf16_e64 v5.h, v128.h // GFX1250: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index b21fca654590a..21077fe4f9f05 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -158,18 +158,6 @@ v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -258,18 +246,6 @@ v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -314,18 +290,6 @@ v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -370,18 +334,6 @@ v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -426,18 +378,6 @@ v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -482,18 +422,6 @@ v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -538,18 +466,6 @@ v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -594,18 +510,6 @@ v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index d1638565a386a..646acf5219d7e 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -162,18 +162,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_tanh_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -266,18 +254,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rcp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -326,18 +302,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -386,18 +350,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -446,18 +398,6 @@ v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -506,18 +446,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -566,18 +494,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -626,18 +542,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index 78afa10b984cb..1907a939b488b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -38,18 +38,6 @@ v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -58,114 +46,30 @@ v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index 6ec4d5f48f8b1..35a51dbe9f922 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -42,18 +42,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_tanh_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -66,18 +54,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rcp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -86,18 +62,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sqrt_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -106,18 +70,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_rsq_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -126,18 +78,6 @@ v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_log_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -146,18 +86,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_exp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -166,18 +94,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_sin_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -186,18 +102,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cos_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index 67747a65ee52a..0b393973b7875 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -4123,18 +4123,10 @@ # GFX1250-REAL16: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_tanh_f16_e64 v5, v128 ; encoding: [0x05,0x00,0x9f,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00] @@ -4159,10 +4151,6 @@ # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00] @@ -4223,18 +4211,10 @@ 0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00 # GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00] -0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00] @@ -4259,10 +4239,6 @@ # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00] @@ -4287,18 +4263,10 @@ # GFX1250-REAL16: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xf9,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00] @@ -4323,10 +4291,6 @@ # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00] @@ -4351,18 +4315,10 @@ # GFX1250-REAL16: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfa,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] @@ -4387,10 +4343,6 @@ # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_rsq_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] @@ -4415,18 +4367,10 @@ # GFX1250-REAL16: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfb,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_log_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] @@ -4451,10 +4395,6 @@ # GFX1250-REAL16: v_log_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_log_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] @@ -4479,18 +4419,10 @@ # GFX1250-REAL16: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_log_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfc,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_exp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] @@ -4515,10 +4447,6 @@ # GFX1250-REAL16: v_exp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_exp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] @@ -4543,18 +4471,10 @@ # GFX1250-REAL16: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_exp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfd,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_sin_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] @@ -4579,10 +4499,6 @@ # GFX1250-REAL16: v_sin_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_sin_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] @@ -4607,18 +4523,10 @@ # GFX1250-REAL16: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_sin_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfe,0xd5,0x80,0x01,0x00,0x00] -0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 -# GFX1250-REAL16: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] -# GFX1250-FAKE16: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - 0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00 # GFX1250-REAL16: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] # GFX1250-FAKE16: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08 -# GFX1250-REAL16: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] -# GFX1250-FAKE16: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] - 0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_cos_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] @@ -4643,10 +4551,6 @@ # GFX1250-REAL16: v_cos_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] # GFX1250-FAKE16: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] -0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10 -# GFX1250-REAL16: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] -# GFX1250-FAKE16: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] - 0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00 # GFX1250-REAL16: v_cos_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] # GFX1250-FAKE16: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index 7c29f8ab01a1b..8b26d2a8696e2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -104,18 +104,6 @@ # GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -197,18 +185,6 @@ 0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff # GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -257,18 +233,6 @@ # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -317,18 +281,6 @@ # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -377,18 +329,6 @@ # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -437,18 +377,6 @@ # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -497,18 +425,6 @@ # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -557,18 +473,6 @@ # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index d26bc46a1f272..15f76c54a1c65 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -34,22 +34,10 @@ # GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] @@ -57,142 +45,58 @@ 0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] From 826cadd5628413a0a92722e0382ba9d968911acc Mon Sep 17 00:00:00 2001 From: Fateme Hosseini Date: Mon, 10 Nov 2025 14:55:07 -0600 Subject: [PATCH 08/30] [Hexagon] Clean-up Instrprof test (#166990) --- llvm/test/CodeGen/Hexagon/instrprof-custom.ll | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll index 620b2acc49520..1c1965d44541f 100644 --- a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll +++ b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=hexagon -relocation-model=pic < %s | FileCheck %s -; RUN: llc -mtriple=hexagon < %s | FileCheck %s +; RUN: llc -mtriple=hexagon --mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp < %s | FileCheck %s ; CHECK-LABEL: test1: ; CHECK: {{call my_instrprof_handler|r0 = #999}} @@ -14,7 +14,4 @@ entry: } ; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn -declare void @llvm.hexagon.instrprof.custom(ptr, i32) #1 - -attributes #0 = { "target-features"="+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp,+hmxv68" } -attributes #1 = { inaccessiblememonly nofree nosync nounwind willreturn } +declare void @llvm.hexagon.instrprof.custom(ptr, i32) From 17e26411f8b8a404e07a628ba17986a2392c1f79 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Mon, 10 Nov 2025 14:56:59 -0600 Subject: [PATCH 09/30] [bazel][clang] Port #167374: split clang options/driver (#167387) --- .../llvm-project-overlay/clang/BUILD.bazel | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 36b71dd49ef13..deb56dc0957e9 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1231,6 +1231,7 @@ cc_library( ":format", ":frontend", ":lex", + ":options", ":rewrite", ":support", ":tooling_core", @@ -1507,6 +1508,19 @@ gentbl_cc_library( deps = ["//llvm:OptParserTdFiles"], ) +cc_library( + name = "options", + srcs = glob(["lib/Options/*.cpp"]), + hdrs = glob(["include/clang/Options/*.h"]), + includes = ["include"], + deps = [ + ":basic", + ":driver_options_inc_gen", + ":static_analyzer_checkers_gen", + "//llvm:Option", + ], +) + cc_library( name = "driver", srcs = glob( @@ -1544,6 +1558,7 @@ cc_library( ":config", ":driver_options_inc_gen", ":lex", + ":options", ":parse", ":static_analyzer_checkers_gen", "//llvm:BinaryFormat", @@ -1700,6 +1715,7 @@ cc_library( ":driver_options_inc_gen", ":edit", ":lex", + ":options", ":parse", ":sema", ":serialization", @@ -1769,6 +1785,7 @@ cc_library( ":frontend", ":frontend_tool", ":lex", + ":options", ":parse", ":sema", ":serialization", @@ -2001,6 +2018,7 @@ cc_library( ":extract_api", ":frontend", ":frontend_rewrite", + ":options", ":static_analyzer_frontend", "//llvm:Option", "//llvm:Support", @@ -2176,6 +2194,7 @@ cc_library( ":frontend_rewrite", ":frontend_tool", ":lex", + ":options", ":parse", ":sema", ":serialization", @@ -2258,6 +2277,7 @@ cc_binary( ":driver", ":frontend", ":frontend_rewrite", + ":options", ":serialization", ":static_analyzer_frontend", ":tooling", From a37c4e0fad5289e5fdd017ea5ac6162d71fb73ae Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 10 Nov 2025 13:03:20 -0800 Subject: [PATCH 10/30] [NFC][SpecialCaseList] Hide Section internals in private section (#167276) Preparing to moving most of implementation out of the header file. * https://github.com/llvm/llvm-project/pull/167280 --------- Co-authored-by: Naveen Seth Hanig Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- clang/lib/Basic/Diagnostic.cpp | 2 +- clang/lib/Basic/ProfileList.cpp | 2 +- clang/lib/Basic/SanitizerSpecialCaseList.cpp | 4 ++-- llvm/include/llvm/Support/SpecialCaseList.h | 23 +++++++++++++++----- llvm/lib/Support/SpecialCaseList.cpp | 8 +++++++ 5 files changed, 30 insertions(+), 9 deletions(-) diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp index 2dec26ecacf26..5e9da245e2b43 100644 --- a/clang/lib/Basic/Diagnostic.cpp +++ b/clang/lib/Basic/Diagnostic.cpp @@ -534,7 +534,7 @@ WarningsSpecialCaseList::create(const llvm::MemoryBuffer &Input, void WarningsSpecialCaseList::processSections(DiagnosticsEngine &Diags) { static constexpr auto WarningFlavor = clang::diag::Flavor::WarningOrError; for (const auto &SectionEntry : sections()) { - StringRef DiagGroup = SectionEntry.SectionStr; + StringRef DiagGroup = SectionEntry.name(); if (DiagGroup == "*") { // Drop the default section introduced by special case list, we only // support exact diagnostic group names. diff --git a/clang/lib/Basic/ProfileList.cpp b/clang/lib/Basic/ProfileList.cpp index 9cb118893a0d9..8727057eb78d1 100644 --- a/clang/lib/Basic/ProfileList.cpp +++ b/clang/lib/Basic/ProfileList.cpp @@ -36,7 +36,7 @@ class ProfileSpecialCaseList : public llvm::SpecialCaseList { bool hasPrefix(StringRef Prefix) const { for (const auto &It : sections()) - if (It.Entries.count(Prefix) > 0) + if (It.hasPrefix(Prefix)) return true; return false; } diff --git a/clang/lib/Basic/SanitizerSpecialCaseList.cpp b/clang/lib/Basic/SanitizerSpecialCaseList.cpp index 56f551628cf89..928c086898097 100644 --- a/clang/lib/Basic/SanitizerSpecialCaseList.cpp +++ b/clang/lib/Basic/SanitizerSpecialCaseList.cpp @@ -42,7 +42,7 @@ void SanitizerSpecialCaseList::createSanitizerSections() { SanitizerMask Mask; #define SANITIZER(NAME, ID) \ - if (S.SectionMatcher.matchAny(NAME)) \ + if (S.matchName(NAME)) \ Mask |= SanitizerKind::ID; #define SANITIZER_GROUP(NAME, ID, ALIAS) SANITIZER(NAME, ID) @@ -68,7 +68,7 @@ SanitizerSpecialCaseList::inSectionBlame(SanitizerMask Mask, StringRef Prefix, if (S.Mask & Mask) { unsigned LineNum = S.S.getLastMatch(Prefix, Query, Category); if (LineNum > 0) - return {S.S.FileIdx, LineNum}; + return {S.S.fileIndex(), LineNum}; } } return NotFound; diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h index cb8e568de02e0..dcc68b73d0675 100644 --- a/llvm/include/llvm/Support/SpecialCaseList.h +++ b/llvm/include/llvm/Support/SpecialCaseList.h @@ -201,17 +201,22 @@ class SpecialCaseList { using SectionEntries = StringMap>; protected: - struct Section { + class Section { + public: Section(StringRef Str, unsigned FileIdx, bool UseGlobs) : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false), SectionStr(Str), FileIdx(FileIdx) {} Section(Section &&) = default; - Matcher SectionMatcher; - SectionEntries Entries; - std::string SectionStr; - unsigned FileIdx; + // Return name of the section, its entire string in []. + StringRef name() const { return SectionStr; } + + // Returns true if string 'Name' matches section name interpreted as a glob. + LLVM_ABI bool matchName(StringRef Name) const; + + // Return sequence number of the file where this section is defined. + unsigned fileIndex() const { return FileIdx; } // Helper method to search by Prefix, Query, and Category. Returns // 1-based line number on which rule is defined, or 0 if there is no match. @@ -223,11 +228,19 @@ class SpecialCaseList { LLVM_ABI StringRef getLongestMatch(StringRef Prefix, StringRef Query, StringRef Category) const; + /// Returns true if the section has any entries for the given prefix. + LLVM_ABI bool hasPrefix(StringRef Prefix) const; + private: friend class SpecialCaseList; LLVM_ABI void preprocess(bool OrderBySize); LLVM_ABI const SpecialCaseList::Matcher * findMatcher(StringRef Prefix, StringRef Category) const; + + Matcher SectionMatcher; + std::string SectionStr; + SectionEntries Entries; + unsigned FileIdx; }; ArrayRef sections() const { return Sections; } diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 246d90cce3a43..465e10ae87588 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -348,6 +348,10 @@ SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix, return NotFound; } +bool SpecialCaseList::Section::matchName(StringRef Name) const { + return SectionMatcher.matchAny(Name); +} + const SpecialCaseList::Matcher * SpecialCaseList::Section::findMatcher(StringRef Prefix, StringRef Category) const { @@ -393,4 +397,8 @@ StringRef SpecialCaseList::Section::getLongestMatch(StringRef Prefix, return LongestRule; } +bool SpecialCaseList::Section::hasPrefix(StringRef Prefix) const { + return Entries.find(Prefix) != Entries.end(); +} + } // namespace llvm From b4a61517a64c6e65294e61a23276c87119f73667 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Mon, 10 Nov 2025 22:04:49 +0100 Subject: [PATCH 11/30] [CIR][NFC] Re-land: Add test for Complex imag literal GNU extension (#167383) Re-land: Add test for Complex imag literal GNU extension after updating the name --- clang/test/CIR/CodeGen/complex.cpp | 39 ++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp index 3fb78dc871904..4eab3999dfc42 100644 --- a/clang/test/CIR/CodeGen/complex.cpp +++ b/clang/test/CIR/CodeGen/complex.cpp @@ -1495,3 +1495,42 @@ void calling_function_that_return_complex() { // OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 // OGCG: store float %[[RESULT_REAL]], ptr %[[A_REAL_PTR]], align 4 // OGCG: store float %[[RESULT_IMAG]], ptr %[[A_IMAG_PTR]], align 4 + +void imag_literal_gnu_extension() { + float _Complex a = 3.0fi; + double _Complex b = 3.0i; + int _Complex c = 3i; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a", init] +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["c", init] +// CIR: %[[COMPLEX_A:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.float, #cir.fp<3.000000e+00> : !cir.float> : !cir.complex +// CIR: cir.store{{.*}} %[[COMPLEX_A]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR: %[[COMPLEX_B:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.double, #cir.fp<3.000000e+00> : !cir.double> : !cir.complex +// CIR: cir.store{{.*}} %[[COMPLEX_B]], %[[B_ADDR]] : !cir.complex, !cir.ptr> +// CIR: %[[COMPLEX_C:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<3> : !s32i> : !cir.complex +// CIR: cir.store{{.*}} %[[COMPLEX_C]], %[[C_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { double, double }, i64 1, align 8 +// LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: store { float, float } { float 0.000000e+00, float 3.000000e+00 }, ptr %[[A_ADDR]], align 4 +// LLVM: store { double, double } { double 0.000000e+00, double 3.000000e+00 }, ptr %[[B_ADDR]], align 8 +// LLVM: store { i32, i32 } { i32 0, i32 3 }, ptr %[[C_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { double, double }, align 8 +// OGCG: %[[C_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float 0.000000e+00, ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float 3.000000e+00, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store double 0.000000e+00, ptr %[[B_REAL_PTR]], align 8 +// OGCG: store double 3.000000e+00, ptr %[[B_IMAG_PTR]], align 8 +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 0 +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 1 +// OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4 +// OGCG: store i32 3, ptr %[[C_IMAG_PTR]], align 4 From 540250ca7a58e9fa11d02b0e894166b0e4723f70 Mon Sep 17 00:00:00 2001 From: Kevin Sala Penades Date: Mon, 10 Nov 2025 13:11:03 -0800 Subject: [PATCH 12/30] [OpenMP][Clang] Add codegen support for dyn_groupprivate clause (#152830) This adds the codegen support for the dyn_groupprivate clause. --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 51 +- .../target_dyn_groupprivate_codegen.cpp | 2633 +++++++++++++++++ .../llvm/Frontend/OpenMP/OMPConstants.h | 10 + .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 23 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 30 +- 5 files changed, 2718 insertions(+), 29 deletions(-) create mode 100644 clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 44ba72c5c76c7..1224fa681cdc0 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -10013,19 +10013,44 @@ static llvm::Value *emitDeviceID( return DeviceID; } -static llvm::Value *emitDynCGGroupMem(const OMPExecutableDirective &D, - CodeGenFunction &CGF) { - llvm::Value *DynCGroupMem = CGF.Builder.getInt32(0); - - if (auto *DynMemClause = D.getSingleClause()) { - CodeGenFunction::RunCleanupsScope DynCGroupMemScope(CGF); - llvm::Value *DynCGroupMemVal = CGF.EmitScalarExpr( - DynMemClause->getSize(), /*IgnoreResultAssign=*/true); - DynCGroupMem = CGF.Builder.CreateIntCast(DynCGroupMemVal, CGF.Int32Ty, - /*isSigned=*/false); +static std::pair +emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) { + llvm::Value *DynGP = CGF.Builder.getInt32(0); + auto DynGPFallback = OMPDynGroupprivateFallbackType::Abort; + + if (auto *DynGPClause = D.getSingleClause()) { + CodeGenFunction::RunCleanupsScope DynGPScope(CGF); + llvm::Value *DynGPVal = + CGF.EmitScalarExpr(DynGPClause->getSize(), /*IgnoreResultAssign=*/true); + DynGP = CGF.Builder.CreateIntCast(DynGPVal, CGF.Int32Ty, + /*isSigned=*/false); + auto FallbackModifier = DynGPClause->getDynGroupprivateFallbackModifier(); + switch (FallbackModifier) { + case OMPC_DYN_GROUPPRIVATE_FALLBACK_abort: + DynGPFallback = OMPDynGroupprivateFallbackType::Abort; + break; + case OMPC_DYN_GROUPPRIVATE_FALLBACK_null: + DynGPFallback = OMPDynGroupprivateFallbackType::Null; + break; + case OMPC_DYN_GROUPPRIVATE_FALLBACK_default_mem: + case OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown: + // This is the default for dyn_groupprivate. + DynGPFallback = OMPDynGroupprivateFallbackType::DefaultMem; + break; + default: + llvm_unreachable("Unknown fallback modifier for OpenMP dyn_groupprivate"); + } + } else if (auto *OMPXDynCGClause = + D.getSingleClause()) { + CodeGenFunction::RunCleanupsScope DynCGMemScope(CGF); + llvm::Value *DynCGMemVal = CGF.EmitScalarExpr(OMPXDynCGClause->getSize(), + /*IgnoreResultAssign=*/true); + DynGP = CGF.Builder.CreateIntCast(DynCGMemVal, CGF.Int32Ty, + /*isSigned=*/false); } - return DynCGroupMem; + return {DynGP, DynGPFallback}; } + static void genMapInfoForCaptures( MappableExprsHandler &MEHandler, CodeGenFunction &CGF, const CapturedStmt &CS, llvm::SmallVectorImpl &CapturedVars, @@ -10234,7 +10259,7 @@ static void emitTargetCallKernelLaunch( llvm::Value *RTLoc = OMPRuntime->emitUpdateLocation(CGF, D.getBeginLoc()); llvm::Value *NumIterations = OMPRuntime->emitTargetNumIterationsCall(CGF, D, SizeEmitter); - llvm::Value *DynCGGroupMem = emitDynCGGroupMem(D, CGF); + auto [DynCGroupMem, DynCGroupMemFallback] = emitDynCGroupMem(D, CGF); llvm::OpenMPIRBuilder::InsertPointTy AllocaIP( CGF.AllocaInsertPt->getParent(), CGF.AllocaInsertPt->getIterator()); @@ -10244,7 +10269,7 @@ static void emitTargetCallKernelLaunch( llvm::OpenMPIRBuilder::TargetKernelArgs Args( NumTargetItems, RTArgs, NumIterations, NumTeams, NumThreads, - DynCGGroupMem, HasNoWait); + DynCGroupMem, HasNoWait, DynCGroupMemFallback); llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail(OMPRuntime->getOMPBuilder().emitKernelLaunch( diff --git a/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp new file mode 100644 index 0000000000000..758f35d629ace --- /dev/null +++ b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp @@ -0,0 +1,2633 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3 + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK9 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK9 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK11 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK11 + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" + + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + + + + +// We have 6 target regions + + + +// Check target registration is registered as a Ctor. + + +template +tx ftemplate(int n) { + tx a = 0; + + #pragma omp target teams dyn_groupprivate(tx(20)) + { + } + + short b = 1; + #pragma omp target teams num_teams(b) dyn_groupprivate(1024) + { + a += b; + } + + return a; +} + +static +int fstatic(int n) { + + #pragma omp target teams distribute parallel for simd num_teams(n) dyn_groupprivate(n*32) + for (int i = 0; i < n ; ++i) { + } + + #pragma omp target teams dyn_groupprivate(fallback(default_mem): 32+n) nowait + { + } + + return n+1; +} + +struct S1 { + double a; + + int r1(int n){ + int b = 1; + + #pragma omp target teams dyn_groupprivate(fallback(null): n-b) + { + this->a = (double)b + 1.5; + } + + #pragma omp target dyn_groupprivate(fallback(abort): 1024) + { + this->a = 2.5; + } + + return (int)a; + } +}; + +int bar(int n){ + int a = 0; + + S1 S; + a += S.r1(n); + + a += fstatic(n); + + a += ftemplate(n); + + return a; +} + + + + + + + + + + + + + + + + + + + + + +// Check that the offloading functions are emitted and that the parallel function +// is appropriately guarded. + + + + + + +#endif + +// CHECK1-LABEL: define {{[^@]+}}@_Z3bari +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8 +// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[A]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(ptr noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP0]]) +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[A]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP2]]) +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]] +// CHECK1-NEXT: store i32 [[ADD2]], ptr [[A]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP4]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]] +// CHECK1-NEXT: store i32 [[ADD4]], ptr [[A]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: ret i32 [[TMP6]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store i32 1, ptr [[B]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]] +// CHECK1-NEXT: store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[B]], align 4 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[B_CASTED]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, ptr [[B_CASTED]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: store ptr [[THIS1]], ptr [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: store ptr [[A]], ptr [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store ptr null, ptr [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP3]], ptr [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP3]], ptr [[TMP10]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP5]], ptr [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP5]], ptr [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-NEXT: store i32 3, ptr [[TMP18]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-NEXT: store i32 3, ptr [[TMP19]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-NEXT: store ptr [[TMP15]], ptr [[TMP20]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-NEXT: store ptr [[TMP16]], ptr [[TMP21]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-NEXT: store ptr null, ptr [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-NEXT: store ptr null, ptr [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, ptr [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-NEXT: store i64 4, ptr [[TMP27]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-NEXT: store i32 [[TMP17]], ptr [[TMP30]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]]) +// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK1-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i64 [[TMP3]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0 +// CHECK1-NEXT: store ptr [[THIS1]], ptr [[TMP33]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0 +// CHECK1-NEXT: store ptr [[A2]], ptr [[TMP34]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i64 0, i64 0 +// CHECK1-NEXT: store ptr null, ptr [[TMP35]], align 8 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 +// CHECK1-NEXT: store i32 3, ptr [[TMP38]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 +// CHECK1-NEXT: store i32 1, ptr [[TMP39]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 +// CHECK1-NEXT: store ptr [[TMP36]], ptr [[TMP40]], align 8 +// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 +// CHECK1-NEXT: store ptr [[TMP37]], ptr [[TMP41]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 +// CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP42]], align 8 +// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 +// CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP43]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 +// CHECK1-NEXT: store ptr null, ptr [[TMP44]], align 8 +// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 +// CHECK1-NEXT: store ptr null, ptr [[TMP45]], align 8 +// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, ptr [[TMP46]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 +// CHECK1-NEXT: store i64 0, ptr [[TMP47]], align 8 +// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4 +// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4 +// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 +// CHECK1-NEXT: store i32 1024, ptr [[TMP50]], align 4 +// CHECK1-NEXT: [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]]) +// CHECK1-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK1-NEXT: br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK1: omp_offload.failed7: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK1: omp_offload.cont8: +// CHECK1-NEXT: [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP53:%.*]] = load double, ptr [[A9]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = fptosi double [[TMP53]] to i32 +// CHECK1-NEXT: ret i32 [[CONV]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32 +// CHECK1-NEXT: store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP4]], ptr [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP4]], ptr [[TMP10]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP6]], ptr [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP6]], ptr [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP8]], ptr [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP8]], ptr [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-NEXT: store ptr null, ptr [[TMP17]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 +// CHECK1-NEXT: [[TMP24:%.*]] = zext i32 [[ADD]] to i64 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-NEXT: store i32 3, ptr [[TMP27]], align 4 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-NEXT: store i32 3, ptr [[TMP28]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-NEXT: store ptr [[TMP18]], ptr [[TMP29]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-NEXT: store ptr [[TMP19]], ptr [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-NEXT: store ptr @.offload_sizes.3, ptr [[TMP31]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP32]], align 8 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-NEXT: store ptr null, ptr [[TMP33]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-NEXT: store ptr null, ptr [[TMP34]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-NEXT: store i64 [[TMP24]], ptr [[TMP35]], align 8 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-NEXT: store i64 8, ptr [[TMP36]], align 8 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-NEXT: store i32 [[TMP25]], ptr [[TMP39]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]]) +// CHECK1-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 +// CHECK1-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP8]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 32, [[TMP42]] +// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4 +// CHECK1-NEXT: [[TMP44:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 8 +// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP44]], ptr [[TMP45]], align 8 +// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP44]], ptr [[TMP46]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i64 0, i64 0 +// CHECK1-NEXT: store ptr null, ptr [[TMP47]], align 8 +// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: store i32 [[TMP51]], ptr [[TMP50]], align 4 +// CHECK1-NEXT: [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 64, i64 4, ptr @.omp_task_entry., i64 -1) +// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 8 +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP55]], ptr align 4 [[AGG_CAPTURED]], i64 4, i1 false) +// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0 +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP57]], ptr align 8 [[TMP48]], i64 8, i1 false) +// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1 +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP58]], ptr align 8 [[TMP49]], i64 8, i1 false) +// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2 +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP59]], ptr align 8 @.offload_sizes.5, i64 8, i1 false) +// CHECK1-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]]) +// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1 +// CHECK1-NEXT: ret i32 [[ADD12]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-NEXT: [[B:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[A]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-NEXT: store i32 0, ptr [[TMP1]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-NEXT: store ptr null, ptr [[TMP3]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-NEXT: store ptr null, ptr [[TMP5]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-NEXT: store ptr null, ptr [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-NEXT: store ptr null, ptr [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, ptr [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-NEXT: store i64 8, ptr [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-NEXT: store i32 20, ptr [[TMP12]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]]) +// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: store i16 1, ptr [[B]], align 2 +// CHECK1-NEXT: [[TMP15:%.*]] = load i16, ptr [[B]], align 2 +// CHECK1-NEXT: store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: store i32 [[TMP16]], ptr [[A_CASTED]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[A_CASTED]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load i16, ptr [[B]], align 2 +// CHECK1-NEXT: store i16 [[TMP18]], ptr [[B_CASTED]], align 2 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[B_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2 +// CHECK1-NEXT: store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2 +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store ptr null, ptr [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP19]], ptr [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP19]], ptr [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store ptr null, ptr [[TMP27]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP21]], ptr [[TMP28]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP21]], ptr [[TMP29]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-NEXT: store ptr null, ptr [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2 +// CHECK1-NEXT: [[TMP34:%.*]] = sext i16 [[TMP33]] to i32 +// CHECK1-NEXT: [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0 +// CHECK1-NEXT: store i32 3, ptr [[TMP36]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1 +// CHECK1-NEXT: store i32 3, ptr [[TMP37]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2 +// CHECK1-NEXT: store ptr [[TMP31]], ptr [[TMP38]], align 8 +// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3 +// CHECK1-NEXT: store ptr [[TMP32]], ptr [[TMP39]], align 8 +// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4 +// CHECK1-NEXT: store ptr @.offload_sizes.7, ptr [[TMP40]], align 8 +// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5 +// CHECK1-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP41]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6 +// CHECK1-NEXT: store ptr null, ptr [[TMP42]], align 8 +// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7 +// CHECK1-NEXT: store ptr null, ptr [[TMP43]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, ptr [[TMP44]], align 8 +// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9 +// CHECK1-NEXT: store i64 8, ptr [[TMP45]], align 8 +// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4 +// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4 +// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12 +// CHECK1-NEXT: store i32 1024, ptr [[TMP48]], align 4 +// CHECK1-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr [[KERNEL_ARGS1]]) +// CHECK1-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CHECK1-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]] +// CHECK1: omp_offload.failed2: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i64 [[TMP17]], i64 [[TMP19]], i64 [[TMP21]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT3]] +// CHECK1: omp_offload.cont3: +// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: ret i32 [[TMP51]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88 +// CHECK1-SAME: (ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8 +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK1-NEXT: store double [[ADD]], ptr [[A]], align 8 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93 +// CHECK1-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK1-NEXT: store double 2.500000e+00, ptr [[A]], align 8 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71 +// CHECK1-SAME: (i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK1-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8 +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] +// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK1-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK1-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK1-NEXT: store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP17]] +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP17]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]]) +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0 +// CHECK1-NEXT: br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0 +// CHECK1-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1 +// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP4]] to i32 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +// CHECK1-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK1: cond.true: +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: br label [[COND_END:%.*]] +// CHECK1: cond.false: +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: br label [[COND_END]] +// CHECK1: cond.end: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK1: omp.inner.for.cond: +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK1: omp.inner.for.body: +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK1: omp.body.continue: +// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK1: omp.inner.for.inc: +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +// CHECK1: omp.inner.for.end: +// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]]) +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK1: .omp.final.then: +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK1-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK1-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK1-NEXT: store i32 [[ADD11]], ptr [[I4]], align 4 +// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK1: .omp.final.done: +// CHECK1-NEXT: br label [[OMP_PRECOND_END]] +// CHECK1: omp.precond.end: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75 +// CHECK1-SAME: (i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map. +// CHECK1-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8 +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTADDR2]], align 8 +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTADDR3]], align 8 +// CHECK1-NEXT: store ptr [[TMP9]], ptr [[TMP10]], align 8 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@.omp_task_entry. +// CHECK1-SAME: (i32 noundef signext [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1 +// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]]) +// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]]) +// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META29:![0-9]+]]) +// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META31:![0-9]+]]) +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META33:![0-9]+]] +// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]] +// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4 +// CHECK1-NEXT: store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META33]] +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 +// CHECK1-NEXT: store i32 1, ptr [[TMP16]], align 4, !noalias [[META33]] +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 +// CHECK1-NEXT: store ptr [[TMP12]], ptr [[TMP17]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 +// CHECK1-NEXT: store ptr [[TMP13]], ptr [[TMP18]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 +// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP19]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 +// CHECK1-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP20]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 +// CHECK1-NEXT: store ptr null, ptr [[TMP21]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 +// CHECK1-NEXT: store ptr null, ptr [[TMP22]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, ptr [[TMP23]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 +// CHECK1-NEXT: store i64 9, ptr [[TMP24]], align 8, !noalias [[META33]] +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META33]] +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META33]] +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 +// CHECK1-NEXT: store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META33]] +// CHECK1-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]]) +// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] +// CHECK1: omp_offload.failed.i: +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4 +// CHECK1-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META33]] +// CHECK1-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias [[META33]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i64 [[TMP31]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] +// CHECK1: .omp_outlined..exit: +// CHECK1-NEXT: ret i32 0 +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55 +// CHECK1-SAME: () #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60 +// CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2 +// CHECK1-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK1-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0) +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK1-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8 +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4 +// CHECK1-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@_Z3bari +// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4 +// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, ptr [[A]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN2S12r1Ei(ptr noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP0]]) +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]] +// CHECK3-NEXT: store i32 [[ADD]], ptr [[A]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP2]]) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4 +// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]] +// CHECK3-NEXT: store i32 [[ADD2]], ptr [[A]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: [[CALL3:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP4]]) +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4 +// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]] +// CHECK3-NEXT: store i32 [[ADD4]], ptr [[A]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[A]], align 4 +// CHECK3-NEXT: ret i32 [[TMP6]] +// +// +// CHECK3-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4 +// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 4 +// CHECK3-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CHECK3-NEXT: store i32 1, ptr [[B]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]] +// CHECK3-NEXT: store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[B]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[B_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: store ptr [[THIS1]], ptr [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: store ptr [[A]], ptr [[TMP7]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK3-NEXT: store ptr null, ptr [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 [[TMP3]], ptr [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 [[TMP3]], ptr [[TMP10]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK3-NEXT: store i32 [[TMP5]], ptr [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK3-NEXT: store i32 [[TMP5]], ptr [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK3-NEXT: store ptr null, ptr [[TMP14]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK3-NEXT: store i32 3, ptr [[TMP18]], align 4 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 3, ptr [[TMP19]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK3-NEXT: store ptr [[TMP15]], ptr [[TMP20]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK3-NEXT: store ptr [[TMP16]], ptr [[TMP21]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK3-NEXT: store ptr @.offload_sizes, ptr [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK3-NEXT: store ptr null, ptr [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK3-NEXT: store ptr null, ptr [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, ptr [[TMP26]], align 8 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK3-NEXT: store i64 4, ptr [[TMP27]], align 8 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK3-NEXT: store i32 [[TMP17]], ptr [[TMP30]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]]) +// CHECK3-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK3-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK3: omp_offload.failed: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i32 [[TMP3]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK3: omp_offload.cont: +// CHECK3-NEXT: [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0 +// CHECK3-NEXT: store ptr [[THIS1]], ptr [[TMP33]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0 +// CHECK3-NEXT: store ptr [[A2]], ptr [[TMP34]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i32 0, i32 0 +// CHECK3-NEXT: store ptr null, ptr [[TMP35]], align 4 +// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 +// CHECK3-NEXT: store i32 3, ptr [[TMP38]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 +// CHECK3-NEXT: store i32 1, ptr [[TMP39]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 +// CHECK3-NEXT: store ptr [[TMP36]], ptr [[TMP40]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 +// CHECK3-NEXT: store ptr [[TMP37]], ptr [[TMP41]], align 4 +// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 +// CHECK3-NEXT: store ptr @.offload_sizes.1, ptr [[TMP42]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 +// CHECK3-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP43]], align 4 +// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 +// CHECK3-NEXT: store ptr null, ptr [[TMP44]], align 4 +// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 +// CHECK3-NEXT: store ptr null, ptr [[TMP45]], align 4 +// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, ptr [[TMP46]], align 8 +// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 +// CHECK3-NEXT: store i64 0, ptr [[TMP47]], align 8 +// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4 +// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4 +// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 +// CHECK3-NEXT: store i32 1024, ptr [[TMP50]], align 4 +// CHECK3-NEXT: [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]]) +// CHECK3-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK3-NEXT: br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK3: omp_offload.failed7: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK3: omp_offload.cont8: +// CHECK3-NEXT: [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP53:%.*]] = load double, ptr [[A9]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = fptosi double [[TMP53]] to i32 +// CHECK3-NEXT: ret i32 [[CONV]] +// +// +// CHECK3-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 4 +// CHECK3-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32 +// CHECK3-NEXT: store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: store i32 [[TMP4]], ptr [[TMP9]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: store i32 [[TMP4]], ptr [[TMP10]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 [[TMP6]], ptr [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 [[TMP6]], ptr [[TMP13]], align 4 +// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK3-NEXT: store ptr null, ptr [[TMP14]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK3-NEXT: store i32 [[TMP8]], ptr [[TMP15]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK3-NEXT: store i32 [[TMP8]], ptr [[TMP16]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK3-NEXT: store ptr null, ptr [[TMP17]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 +// CHECK3-NEXT: [[TMP24:%.*]] = zext i32 [[ADD]] to i64 +// CHECK3-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK3-NEXT: store i32 3, ptr [[TMP27]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 3, ptr [[TMP28]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK3-NEXT: store ptr [[TMP18]], ptr [[TMP29]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK3-NEXT: store ptr [[TMP19]], ptr [[TMP30]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK3-NEXT: store ptr @.offload_sizes.3, ptr [[TMP31]], align 4 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK3-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP32]], align 4 +// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK3-NEXT: store ptr null, ptr [[TMP33]], align 4 +// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK3-NEXT: store ptr null, ptr [[TMP34]], align 4 +// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK3-NEXT: store i64 [[TMP24]], ptr [[TMP35]], align 8 +// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK3-NEXT: store i64 8, ptr [[TMP36]], align 8 +// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4 +// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK3-NEXT: store i32 [[TMP25]], ptr [[TMP39]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]]) +// CHECK3-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 +// CHECK3-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK3: omp_offload.failed: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP8]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK3: omp_offload.cont: +// CHECK3-NEXT: [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 32, [[TMP42]] +// CHECK3-NEXT: store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK3-NEXT: store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4 +// CHECK3-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4 +// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0 +// CHECK3-NEXT: store i32 [[TMP44]], ptr [[TMP45]], align 4 +// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0 +// CHECK3-NEXT: store i32 [[TMP44]], ptr [[TMP46]], align 4 +// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i32 0, i32 0 +// CHECK3-NEXT: store ptr null, ptr [[TMP47]], align 4 +// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK3-NEXT: store i32 [[TMP51]], ptr [[TMP50]], align 4 +// CHECK3-NEXT: [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 36, i32 4, ptr @.omp_task_entry., i64 -1) +// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 4 +// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP55]], ptr align 4 [[AGG_CAPTURED]], i32 4, i1 false) +// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0 +// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP57]], ptr align 4 @.offload_sizes.5, i32 8, i1 false) +// CHECK3-NEXT: [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1 +// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP58]], ptr align 4 [[TMP48]], i32 4, i1 false) +// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2 +// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP59]], ptr align 4 [[TMP49]], i32 4, i1 false) +// CHECK3-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]]) +// CHECK3-NEXT: [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1 +// CHECK3-NEXT: ret i32 [[ADD12]] +// +// +// CHECK3-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-NEXT: [[B:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4 +// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4 +// CHECK3-NEXT: [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 0, ptr [[A]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK3-NEXT: store i32 3, ptr [[TMP0]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 0, ptr [[TMP1]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK3-NEXT: store ptr null, ptr [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK3-NEXT: store ptr null, ptr [[TMP3]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK3-NEXT: store ptr null, ptr [[TMP4]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK3-NEXT: store ptr null, ptr [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK3-NEXT: store ptr null, ptr [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK3-NEXT: store ptr null, ptr [[TMP7]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, ptr [[TMP8]], align 8 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK3-NEXT: store i64 8, ptr [[TMP9]], align 8 +// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4 +// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK3-NEXT: store i32 20, ptr [[TMP12]], align 4 +// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]]) +// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK3-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK3: omp_offload.failed: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]] +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK3: omp_offload.cont: +// CHECK3-NEXT: store i16 1, ptr [[B]], align 2 +// CHECK3-NEXT: [[TMP15:%.*]] = load i16, ptr [[B]], align 2 +// CHECK3-NEXT: store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2 +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[A]], align 4 +// CHECK3-NEXT: store i32 [[TMP16]], ptr [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i16, ptr [[B]], align 2 +// CHECK3-NEXT: store i16 [[TMP18]], ptr [[B_CASTED]], align 2 +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[B_CASTED]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2 +// CHECK3-NEXT: store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2 +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: store i32 [[TMP17]], ptr [[TMP22]], align 4 +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: store i32 [[TMP17]], ptr [[TMP23]], align 4 +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0 +// CHECK3-NEXT: store ptr null, ptr [[TMP24]], align 4 +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 [[TMP19]], ptr [[TMP25]], align 4 +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK3-NEXT: store i32 [[TMP19]], ptr [[TMP26]], align 4 +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1 +// CHECK3-NEXT: store ptr null, ptr [[TMP27]], align 4 +// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK3-NEXT: store i32 [[TMP21]], ptr [[TMP28]], align 4 +// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK3-NEXT: store i32 [[TMP21]], ptr [[TMP29]], align 4 +// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2 +// CHECK3-NEXT: store ptr null, ptr [[TMP30]], align 4 +// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2 +// CHECK3-NEXT: [[TMP34:%.*]] = sext i16 [[TMP33]] to i32 +// CHECK3-NEXT: [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0 +// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0 +// CHECK3-NEXT: store i32 3, ptr [[TMP36]], align 4 +// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1 +// CHECK3-NEXT: store i32 3, ptr [[TMP37]], align 4 +// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2 +// CHECK3-NEXT: store ptr [[TMP31]], ptr [[TMP38]], align 4 +// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3 +// CHECK3-NEXT: store ptr [[TMP32]], ptr [[TMP39]], align 4 +// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4 +// CHECK3-NEXT: store ptr @.offload_sizes.7, ptr [[TMP40]], align 4 +// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5 +// CHECK3-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP41]], align 4 +// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6 +// CHECK3-NEXT: store ptr null, ptr [[TMP42]], align 4 +// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7 +// CHECK3-NEXT: store ptr null, ptr [[TMP43]], align 4 +// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, ptr [[TMP44]], align 8 +// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9 +// CHECK3-NEXT: store i64 8, ptr [[TMP45]], align 8 +// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4 +// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4 +// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12 +// CHECK3-NEXT: store i32 1024, ptr [[TMP48]], align 4 +// CHECK3-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr [[KERNEL_ARGS1]]) +// CHECK3-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CHECK3-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]] +// CHECK3: omp_offload.failed2: +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i32 [[TMP17]], i32 [[TMP19]], i32 [[TMP21]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT3]] +// CHECK3: omp_offload.cont3: +// CHECK3-NEXT: [[TMP51:%.*]] = load i32, ptr [[A]], align 4 +// CHECK3-NEXT: ret i32 [[TMP51]] +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88 +// CHECK3-SAME: (ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4 +// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK3-NEXT: store double [[ADD]], ptr [[A]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93 +// CHECK3-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK3-NEXT: store double 2.500000e+00, ptr [[A]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71 +// CHECK3-SAME: (i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK3-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4 +// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18:![0-9]+]] +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]] +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]]) +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +// CHECK3-NEXT: br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0 +// CHECK3-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1 +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1 +// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]] +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]] +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP22]] +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]] +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK3-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]] +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]]) +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK3-NEXT: [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1 +// CHECK3-NEXT: [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1 +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]] +// CHECK3-NEXT: store i32 [[ADD10]], ptr [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75 +// CHECK3-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@.omp_task_privates_map. +// CHECK3-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 +// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4 +// CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4 +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR3]], align 4 +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTADDR1]], align 4 +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTADDR2]], align 4 +// CHECK3-NEXT: store ptr [[TMP9]], ptr [[TMP10]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@.omp_task_entry. +// CHECK3-SAME: (i32 noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1 +// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]]) +// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META28:![0-9]+]]) +// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]]) +// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]]) +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META34:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]] +// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4 +// CHECK3-NEXT: store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 +// CHECK3-NEXT: store i32 1, ptr [[TMP16]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 +// CHECK3-NEXT: store ptr [[TMP12]], ptr [[TMP17]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 +// CHECK3-NEXT: store ptr [[TMP13]], ptr [[TMP18]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 +// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP19]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 +// CHECK3-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP20]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 +// CHECK3-NEXT: store ptr null, ptr [[TMP21]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 +// CHECK3-NEXT: store ptr null, ptr [[TMP22]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, ptr [[TMP23]], align 8, !noalias [[META34]] +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 +// CHECK3-NEXT: store i64 9, ptr [[TMP24]], align 8, !noalias [[META34]] +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 +// CHECK3-NEXT: store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]]) +// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK3-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] +// CHECK3: omp_offload.failed.i: +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4 +// CHECK3-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i32 [[TMP31]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] +// CHECK3: .omp_outlined..exit: +// CHECK3-NEXT: ret i32 0 +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55 +// CHECK3-SAME: () #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60 +// CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2 +// CHECK3-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK3-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK3-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4 +// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]] +// CHECK3-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]) +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK9-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]]) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] +// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]] +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]] +// CHECK9-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]] +// CHECK9-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP22]]) +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0 +// CHECK9-NEXT: br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0 +// CHECK9-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1 +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK9-NEXT: [[CONV:%.*]] = trunc i64 [[TMP4]] to i32 +// CHECK9-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK9-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK9-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +// CHECK9-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20:![0-9]+]] +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP20]] +// CHECK9-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]] +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP20]] +// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK9: omp.body.continue: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]] +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK9-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]] +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]]) +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK9-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK9-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK9-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK9-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK9-NEXT: store i32 [[ADD11]], ptr [[I4]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]]) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK9-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// CHECK9-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK9-NEXT: store double [[ADD]], ptr [[A]], align 8 +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK9-NEXT: store double 2.500000e+00, ptr [[A]], align 8 +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2 +// CHECK9-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK9-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0) +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8 +// CHECK9-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK9-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2 +// CHECK9-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]]) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK9-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]] +// CHECK9-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4 +// CHECK9-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]) +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK11-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4 +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]] +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK11-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP20]]) +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +// CHECK11-NEXT: br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0 +// CHECK11-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]] +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK11-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK11-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]]) +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK11-NEXT: [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1 +// CHECK11-NEXT: [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1 +// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]] +// CHECK11-NEXT: store i32 [[ADD10]], ptr [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4 +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// CHECK11-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK11-NEXT: store double [[ADD]], ptr [[A]], align 4 +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK11-NEXT: store double 2.500000e+00, ptr [[A]], align 4 +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2 +// CHECK11-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK11-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0) +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK11-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4 +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK11-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]] +// CHECK11-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4 +// CHECK11-NEXT: ret void +// diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index 7bec7e0c6736d..1ac9ac040468c 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -190,6 +190,16 @@ enum class OMPScheduleType { LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask) }; +/// The fallback types for the dyn_groupprivate clause. +enum class OMPDynGroupprivateFallbackType : uint64_t { + /// Abort the execution. + Abort = 0, + /// Return null pointer. + Null = 1, + /// Allocate from a implementation defined memory space. + DefaultMem = 2 +}; + // Default OpenMP mapper name suffix. inline constexpr const char *OmpDefaultMapperName = ".omp.default.mapper"; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index fd6b9729658c1..9f77c24d0b27b 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -2446,20 +2446,24 @@ class OpenMPIRBuilder { /// The number of threads. ArrayRef NumThreads; /// The size of the dynamic shared memory. - Value *DynCGGroupMem = nullptr; + Value *DynCGroupMem = nullptr; /// True if the kernel has 'no wait' clause. bool HasNoWait = false; + /// The fallback mechanism for the shared memory. + omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback = + omp::OMPDynGroupprivateFallbackType::Abort; // Constructors for TargetKernelArgs. TargetKernelArgs() = default; TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs, Value *NumIterations, ArrayRef NumTeams, - ArrayRef NumThreads, Value *DynCGGroupMem, - bool HasNoWait) + ArrayRef NumThreads, Value *DynCGroupMem, + bool HasNoWait, + omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback) : NumTargetItems(NumTargetItems), RTArgs(RTArgs), NumIterations(NumIterations), NumTeams(NumTeams), - NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem), - HasNoWait(HasNoWait) {} + NumThreads(NumThreads), DynCGroupMem(DynCGroupMem), + HasNoWait(HasNoWait), DynCGroupMemFallback(DynCGroupMemFallback) {} }; /// Create the kernel args vector used by emitTargetKernel. This function @@ -3244,6 +3248,10 @@ class OpenMPIRBuilder { /// dependency information as passed in the depend clause /// \param HasNowait Whether the target construct has a `nowait` clause or /// not. + /// \param DynCGroupMem The size of the dynamic groupprivate memory for each + /// cgroup. + /// \param DynCGroupMem The fallback mechanism to execute if the requested + /// cgroup memory cannot be provided. LLVM_ABI InsertPointOrErrorTy createTarget( const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, @@ -3255,7 +3263,10 @@ class OpenMPIRBuilder { TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, - const SmallVector &Dependencies, bool HasNowait = false); + const SmallVector &Dependencies, bool HasNowait = false, + Value *DynCGroupMem = nullptr, + omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback = + omp::OMPDynGroupprivateFallbackType::Abort); /// Returns __kmpc_for_static_init_* runtime function for the specified /// size \a IVSize and sign \a IVSigned. Will create a distribute call diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 7dc32fda0eed6..18a4f0a08cb5f 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -530,7 +530,13 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, auto Int32Ty = Type::getInt32Ty(Builder.getContext()); constexpr size_t MaxDim = 3; Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim)); - Value *Flags = Builder.getInt64(KernelArgs.HasNoWait); + + Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait); + + Value *DynCGroupMemFallbackFlag = + Builder.getInt64(static_cast(KernelArgs.DynCGroupMemFallback)); + DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2); + Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag); assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty()); @@ -559,7 +565,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Flags, NumTeams3D, NumThreads3D, - KernelArgs.DynCGGroupMem}; + KernelArgs.DynCGroupMem}; } void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { @@ -8224,7 +8230,8 @@ static void emitTargetCall( OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector &Dependencies, - bool HasNoWait) { + bool HasNoWait, Value *DynCGroupMem, + OMPDynGroupprivateFallbackType DynCGroupMemFallback) { // Generate a function call to the host fallback implementation of the target // region. This is called by the host when no offload entry was generated for // the target region and when the offloading call fails at runtime. @@ -8360,12 +8367,13 @@ static void emitTargetCall( /*isSigned=*/false) : Builder.getInt64(0); - // TODO: Use correct DynCGGroupMem - Value *DynCGGroupMem = Builder.getInt32(0); + // Request zero groupprivate bytes by default. + if (!DynCGroupMem) + DynCGroupMem = Builder.getInt32(0); - KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount, - NumTeamsC, NumThreadsC, - DynCGGroupMem, HasNoWait); + KArgs = OpenMPIRBuilder::TargetKernelArgs( + NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem, + HasNoWait, DynCGroupMemFallback); // Assume no error was returned because TaskBodyCB and // EmitTargetCallFallbackCB don't produce any. @@ -8414,7 +8422,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, - const SmallVector &Dependencies, bool HasNowait) { + const SmallVector &Dependencies, bool HasNowait, + Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) { if (!updateToLocation(Loc)) return InsertPointTy(); @@ -8437,7 +8446,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( if (!Config.isTargetDevice()) emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB, - CustomMapperCB, Dependencies, HasNowait); + CustomMapperCB, Dependencies, HasNowait, DynCGroupMem, + DynCGroupMemFallback); return Builder.saveIP(); } From 20e1a1248013fcee994f5de13338b96777fe00cd Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Mon, 10 Nov 2025 13:25:20 -0800 Subject: [PATCH 13/30] [LLDB] Fix (more) darwin shell tests under ASAN --- lldb/test/Shell/helper/build.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lldb/test/Shell/helper/build.py b/lldb/test/Shell/helper/build.py index a5a7e997be044..1fa8aab92c128 100755 --- a/lldb/test/Shell/helper/build.py +++ b/lldb/test/Shell/helper/build.py @@ -804,7 +804,19 @@ def _get_link_command(self): args.extend(self._obj_file_names()) if sys.platform == "darwin": + # By default, macOS doesn't allow injecting the ASAN + # runtime into system processes. + system_clang = ( + subprocess.check_output(["xcrun", "-find", "clang"]) + .strip() + .decode("utf-8") + ) + system_liblto = os.path.join( + os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib" + ) args.extend(["-isysroot", self.apple_sdk]) + args.extend(["-Wl,-lto_library", "-Wl," + system_liblto]) + elif self.objc_gnustep_lib: args.extend(["-L", self.objc_gnustep_lib, "-lobjc"]) if sys.platform == "linux": From fb2fa21bd64a8c21d73438d7ea93dd6f15d15112 Mon Sep 17 00:00:00 2001 From: jofrn Date: Mon, 10 Nov 2025 16:27:55 -0500 Subject: [PATCH 14/30] [AMDGPU] Remove calling conv check on entry function (#162080) It is undefined behavior to call a function with a mismatched calling convention. Rather than crash on this behavior, it should compile. This LLVM defect was identified via the AMD Fuzzing project. --- .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 7 -- llvm/test/CodeGen/AMDGPU/cc-entry.ll | 69 +++++++++++++++++++ 2 files changed, 69 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/cc-entry.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 0ea9add891111..b03d50f2d451d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -261,13 +261,6 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( const Function *Callee = getCalleeFunction(*CalleeOp); - // Avoid crashing on undefined behavior with an illegal call to a - // kernel. If a callsite's calling convention doesn't match the - // function's, it's undefined behavior. If the callsite calling - // convention does match, that would have errored earlier. - if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) - report_fatal_error("invalid call to entry function"); - auto isSameFunction = [](const MachineFunction &MF, const Function *F) { return F == &MF.getFunction(); }; diff --git a/llvm/test/CodeGen/AMDGPU/cc-entry.ll b/llvm/test/CodeGen/AMDGPU/cc-entry.ll new file mode 100644 index 0000000000000..d807f321c4009 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cc-entry.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_kernel void @entry_fn() { +; CHECK-LABEL: entry_fn: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_sext_i32_i16 s5, s5 +; CHECK-NEXT: s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+8 +; CHECK-NEXT: s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+16 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm +entry: + call void @entry_fn() + ret void +} + +define void @caller() { +; CHECK-LABEL: caller: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_mov_b32 s0, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b32 s1, -1 +; CHECK-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s1 +; CHECK-NEXT: s_add_co_i32 s32, s32, 16 +; CHECK-NEXT: v_writelane_b32 v40, s0, 2 +; CHECK-NEXT: s_mov_b64 s[0:1], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_sext_i32_i16 s5, s5 +; CHECK-NEXT: s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+12 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+24 +; CHECK-NEXT: v_mov_b32_e32 v0, v31 +; CHECK-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: s_mov_b64 s[2:3], s[6:7] +; CHECK-NEXT: s_mov_b64 s[6:7], s[10:11] +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s0, v40, 2 +; CHECK-NEXT: s_or_saveexec_b32 s1, -1 +; CHECK-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s1 +; CHECK-NEXT: s_mov_b32 s33, s0 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + call void @entry_fn() + ret void +} From 4b9d7e167b2ab6b7cc6d23516fad8c39ea9bc55f Mon Sep 17 00:00:00 2001 From: Jackson Stogel Date: Mon, 10 Nov 2025 13:39:34 -0800 Subject: [PATCH 15/30] Reapply "[libc] Return errno from OFD failure paths in fcntl." (#166658) (#166846) The previous implementation in #166252 (rolled back in #166658) caused buildbot failures due to a bug in an added test. The modified `UseAfterClose` did not pass a `struct flock` value to `fcntl`: https://github.com/llvm/llvm-project/blob/2d5170594147b42a37698760d6e0194eec4f1390/libc/test/src/fcntl/fcntl_test.cpp#L175 Which ASAN caught and errored in the `fcntl` implementation when the unspecified argument was accessed: https://github.com/llvm/llvm-project/blob/c12cb2892c808af459eaa270b8738a2b375ecc9b/libc/src/__support/OSUtil/linux/fcntl.cpp#L59 --- libc/src/__support/OSUtil/linux/fcntl.cpp | 2 +- libc/test/src/fcntl/fcntl_test.cpp | 161 +++++++++++++--------- 2 files changed, 100 insertions(+), 63 deletions(-) diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp index bb76eee90efd2..08db4859c6417 100644 --- a/libc/src/__support/OSUtil/linux/fcntl.cpp +++ b/libc/src/__support/OSUtil/linux/fcntl.cpp @@ -66,7 +66,7 @@ ErrorOr fcntl(int fd, int cmd, void *arg) { LIBC_NAMESPACE::syscall_impl(FCNTL_SYSCALL_ID, fd, cmd, &flk64); // On failure, return if (ret < 0) - return Error(-1); + return Error(-ret); // Check for overflow, i.e. the offsets are not the same when cast // to off_t from off64_t. if (static_cast(flk64.l_len) != flk64.l_len || diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp index 84feb34e537a0..d008aea54b425 100644 --- a/libc/test/src/fcntl/fcntl_test.cpp +++ b/libc/test/src/fcntl/fcntl_test.cpp @@ -94,68 +94,105 @@ TEST_F(LlvmLibcFcntlTest, FcntlSetFl) { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); } -TEST_F(LlvmLibcFcntlTest, FcntlGetLkRead) { - using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test"; - auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); - - struct flock flk, svflk; - int retVal; - int fd = - LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(fd, 0); - - flk.l_type = F_RDLCK; - flk.l_start = 0; - flk.l_whence = SEEK_SET; - flk.l_len = 50; - - // copy flk into svflk - svflk = flk; - - retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - ASSERT_NE((int)flk.l_type, F_WRLCK); // File should not be write locked. - - retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - - ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); -} - -TEST_F(LlvmLibcFcntlTest, FcntlGetLkWrite) { - using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test"; - auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); - - struct flock flk, svflk; - int retVal; - int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(fd, 0); - - flk.l_type = F_WRLCK; - flk.l_start = 0; - flk.l_whence = SEEK_SET; - flk.l_len = 0; - - // copy flk into svflk - svflk = flk; - - retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - ASSERT_NE((int)flk.l_type, F_RDLCK); // File should not be read locked. - - retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - - ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); -} +/* Tests that are common between OFD and traditional variants of fcntl locks. */ +template +class LibcFcntlCommonLockTests : public LlvmLibcFcntlTest { +public: + void GetLkRead() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + + struct flock flk = {}; + struct flock svflk = {}; + int retVal; + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + flk.l_type = F_RDLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 50; + + // copy flk into svflk + svflk = flk; + + retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + ASSERT_NE((int)svflk.l_type, F_WRLCK); // File should not be write locked. + + retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + } + + void GetLkWrite() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + + struct flock flk = {}; + struct flock svflk = {}; + int retVal; + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + flk.l_type = F_WRLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 0; + + // copy flk into svflk + svflk = flk; + + retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + ASSERT_NE((int)svflk.l_type, F_RDLCK); // File should not be read locked. + + retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + } + + void UseAfterClose() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = + "testdata/fcntl_use_after_close.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + + flock flk = {}; + flk.l_type = F_RDLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 50; + ASSERT_EQ(-1, LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &flk)); + ASSERT_ERRNO_EQ(EBADF); + } +}; + +#define COMMON_LOCK_TESTS(NAME, GETLK_CMD, SETLK_CMD) \ + using NAME = LibcFcntlCommonLockTests; \ + TEST_F(NAME, GetLkRead) { GetLkRead(); } \ + TEST_F(NAME, GetLkWrite) { GetLkWrite(); } \ + TEST_F(NAME, UseAfterClose) { UseAfterClose(); } \ + static_assert(true, "Require semicolon.") + +COMMON_LOCK_TESTS(LlvmLibcFcntlProcessAssociatedLockTest, F_GETLK, F_SETLK); +COMMON_LOCK_TESTS(LlvmLibcFcntlOpenFileDescriptionLockTest, F_OFD_GETLK, + F_OFD_SETLK); TEST_F(LlvmLibcFcntlTest, UseAfterClose) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; From 8c86bc89c161f932ea2dfa0ac23b465d65ac048f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 10 Nov 2025 13:54:35 -0800 Subject: [PATCH 16/30] AMDGPU/GlobalISel: Fix AGPR regbank check for mfma_scale (#167393) Fixes regressions with #159493 after 476a6ea9575 --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 54ba2f8c0d519..dbca5afeca816 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5081,17 +5081,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned MinNumRegsRequired = DstSize / 32; const SIMachineFunctionInfo *Info = MF.getInfo(); + bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); From 0767c640437a8ff02e431a350168a13cb9056428 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 10 Nov 2025 21:55:18 +0000 Subject: [PATCH 17/30] [VPlan] Use getDefiningRecipe instead of directly accessing Def. (NFC) Use getDefiningRecipe to future-proof the code. Split off from https://github.com/llvm/llvm-project/pull/156262 as suggested. --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 5e4303a4c5fff..90696ffc3aca7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -99,20 +99,20 @@ VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def) VPValue::~VPValue() { assert(Users.empty() && "trying to delete a VPValue with remaining users"); - if (Def) + if (VPDef *Def = getDefiningRecipe()) Def->removeDefinedValue(this); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const { - if (const VPRecipeBase *R = dyn_cast_or_null(Def)) + if (const VPRecipeBase *R = getDefiningRecipe()) R->print(OS, "", SlotTracker); else printAsOperand(OS, SlotTracker); } void VPValue::dump() const { - const VPRecipeBase *Instr = dyn_cast_or_null(this->Def); + const VPRecipeBase *Instr = getDefiningRecipe(); VPSlotTracker SlotTracker( (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr); print(dbgs(), SlotTracker); From a1934ee500dc0bc7f4ad6f989a83eaec90b09597 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 10 Nov 2025 14:04:01 -0800 Subject: [PATCH 18/30] [NFC][SpecialCaseList] Replace callback with return value (#165943) This commit introduces `SpecialCaseList::Match`, a small struct to hold the matched rule and its line number. This simplifies the `match` methods by allowing them to return a single value instead of using a callback. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- llvm/include/llvm/Support/SpecialCaseList.h | 24 +++---- llvm/lib/Support/SpecialCaseList.cpp | 71 +++++++++++---------- 2 files changed, 45 insertions(+), 50 deletions(-) diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h index dcc68b73d0675..dee4db584c88b 100644 --- a/llvm/include/llvm/Support/SpecialCaseList.h +++ b/llvm/include/llvm/Support/SpecialCaseList.h @@ -126,15 +126,16 @@ class SpecialCaseList { SpecialCaseList &operator=(SpecialCaseList const &) = delete; private: + using Match = std::pair; + static constexpr Match NotMatched = {"", 0}; + // Lagacy v1 matcher. class RegexMatcher { public: LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); LLVM_ABI void preprocess(bool BySize); - LLVM_ABI void - match(StringRef Query, - llvm::function_ref Cb) const; + LLVM_ABI Match match(StringRef Query) const; struct Reg { Reg(StringRef Name, unsigned LineNo, Regex &&Rg) @@ -152,9 +153,7 @@ class SpecialCaseList { LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); LLVM_ABI void preprocess(bool BySize); - LLVM_ABI void - match(StringRef Query, - llvm::function_ref Cb) const; + LLVM_ABI Match match(StringRef Query) const; struct Glob { Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern) @@ -168,11 +167,10 @@ class SpecialCaseList { RadixTree, RadixTree, - SmallVector>> + SmallVector>> PrefixSuffixToGlob; - RadixTree, - SmallVector> + RadixTree, SmallVector> SubstrToGlob; }; @@ -184,14 +182,10 @@ class SpecialCaseList { LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); LLVM_ABI void preprocess(bool BySize); - LLVM_ABI void - match(StringRef Query, - llvm::function_ref Cb) const; + LLVM_ABI Match match(StringRef Query) const; LLVM_ABI bool matchAny(StringRef Query) const { - bool R = false; - match(Query, [&](StringRef, unsigned) { R = true; }); - return R; + return match(Query) != NotMatched; } std::variant M; diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 465e10ae87588..395a55d75acd4 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -15,11 +15,13 @@ #include "llvm/Support/SpecialCaseList.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -63,12 +65,12 @@ void SpecialCaseList::RegexMatcher::preprocess(bool BySize) { } } -void SpecialCaseList::RegexMatcher::match( - StringRef Query, - llvm::function_ref Cb) const { +SpecialCaseList::Match +SpecialCaseList::RegexMatcher::match(StringRef Query) const { for (const auto &R : reverse(RegExes)) if (R.Rg.match(Query)) - return Cb(R.Name, R.LineNo); + return {R.Name, R.LineNo}; + return NotMatched; } Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern, @@ -90,7 +92,7 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { }); } - for (const auto &G : reverse(Globs)) { + for (const auto &[Idx, G] : enumerate(Globs)) { StringRef Prefix = G.Pattern.prefix(); StringRef Suffix = G.Pattern.suffix(); @@ -102,26 +104,29 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { // But only if substring is not empty. Searching this tree is more // expensive. auto &V = SubstrToGlob.emplace(Substr).first->second; - V.emplace_back(&G); + V.emplace_back(Idx); continue; } } auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second; auto &V = SToGlob.emplace(reverse(Suffix)).first->second; - V.emplace_back(&G); + V.emplace_back(Idx); } } -void SpecialCaseList::GlobMatcher::match( - StringRef Query, - llvm::function_ref Cb) const { +SpecialCaseList::Match +SpecialCaseList::GlobMatcher::match(StringRef Query) const { + int Best = -1; if (!PrefixSuffixToGlob.empty()) { for (const auto &[_, SToGlob] : PrefixSuffixToGlob.find_prefixes(Query)) { for (const auto &[_, V] : SToGlob.find_prefixes(reverse(Query))) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); + for (int Idx : reverse(V)) { + if (Best > Idx) + break; + const GlobMatcher::Glob &G = Globs[Idx]; + if (G.Pattern.match(Query)) { + Best = Idx; // As soon as we find a match in the vector, we can break for this // vector, since the globs are already sorted by priority within the // prefix group. However, we continue searching other prefix groups @@ -138,9 +143,12 @@ void SpecialCaseList::GlobMatcher::match( // possibilities. In most cases search will fail on first characters. for (StringRef Q = Query; !Q.empty(); Q = Q.drop_front()) { for (const auto &[_, V] : SubstrToGlob.find_prefixes(Q)) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); + for (int Idx : reverse(V)) { + if (Best > Idx) + break; + const GlobMatcher::Glob &G = Globs[Idx]; + if (G.Pattern.match(Query)) { + Best = Idx; // As soon as we find a match in the vector, we can break for this // vector, since the globs are already sorted by priority within the // prefix group. However, we continue searching other prefix groups @@ -151,6 +159,9 @@ void SpecialCaseList::GlobMatcher::match( } } } + if (Best < 0) + return NotMatched; + return {Globs[Best].Name, Globs[Best].LineNo}; } SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) @@ -169,12 +180,11 @@ void SpecialCaseList::Matcher::preprocess(bool BySize) { return std::visit([&](auto &V) { return V.preprocess(BySize); }, M); } -void SpecialCaseList::Matcher::match( - StringRef Query, - llvm::function_ref Cb) const { +SpecialCaseList::Match SpecialCaseList::Matcher::match(StringRef Query) const { if (RemoveDotSlash) Query = llvm::sys::path::remove_leading_dotslash(Query); - return std::visit([&](auto &V) { return V.match(Query, Cb); }, M); + return std::visit( + [&](auto &V) -> SpecialCaseList::Match { return V.match(Query); }, M); } // TODO: Refactor this to return Expected<...> @@ -375,26 +385,17 @@ LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) { unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix, StringRef Query, StringRef Category) const { - unsigned LastLine = 0; - if (const Matcher *M = findMatcher(Prefix, Category)) { - M->match(Query, [&](StringRef, unsigned LineNo) { - LastLine = std::max(LastLine, LineNo); - }); - } - return LastLine; + if (const Matcher *M = findMatcher(Prefix, Category)) + return M->match(Query).second; + return 0; } StringRef SpecialCaseList::Section::getLongestMatch(StringRef Prefix, StringRef Query, StringRef Category) const { - StringRef LongestRule; - if (const Matcher *M = findMatcher(Prefix, Category)) { - M->match(Query, [&](StringRef Rule, unsigned) { - if (LongestRule.size() < Rule.size()) - LongestRule = Rule; - }); - } - return LongestRule; + if (const Matcher *M = findMatcher(Prefix, Category)) + return M->match(Query).first; + return {}; } bool SpecialCaseList::Section::hasPrefix(StringRef Prefix) const { From 046ae855360614c5980d0ced0c55b5de507bd9ad Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Mon, 10 Nov 2025 14:17:23 -0800 Subject: [PATCH 19/30] [scudo] Small cleanup of memory tagging code. (#166860) Make the systemSupportsMemoryTagging() function return even on system that don't support memory tagging. This avoids the need to always check if memory tagging is supported before calling th function. Make systemSupportsMemoryTagging() cache the getauxval return value instead of calling the function every time. Updated the code that calls systemSupportsMemoryTagging(). --- compiler-rt/lib/scudo/standalone/combined.h | 3 +-- compiler-rt/lib/scudo/standalone/memtag.h | 7 +++---- compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp | 1 - compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp | 4 +--- .../lib/scudo/standalone/tests/wrappers_cpp_test.cpp | 3 +-- 5 files changed, 6 insertions(+), 12 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index ffe9554203241..2d0f4f53d2ac5 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -171,8 +171,7 @@ class Allocator { Primary.Options.set(OptionBit::DeallocTypeMismatch); if (getFlags()->delete_size_mismatch) Primary.Options.set(OptionBit::DeleteSizeMismatch); - if (allocatorSupportsMemoryTagging() && - systemSupportsMemoryTagging()) + if (systemSupportsMemoryTagging()) Primary.Options.set(OptionBit::UseMemoryTagging); QuarantineMaxChunkSize = diff --git a/compiler-rt/lib/scudo/standalone/memtag.h b/compiler-rt/lib/scudo/standalone/memtag.h index 83ebe676433eb..52897b3dca6ae 100644 --- a/compiler-rt/lib/scudo/standalone/memtag.h +++ b/compiler-rt/lib/scudo/standalone/memtag.h @@ -66,7 +66,8 @@ inline bool systemSupportsMemoryTagging() { #ifndef HWCAP2_MTE #define HWCAP2_MTE (1 << 18) #endif - return getauxval(AT_HWCAP2) & HWCAP2_MTE; + static bool SupportsMemoryTagging = getauxval(AT_HWCAP2) & HWCAP2_MTE; + return SupportsMemoryTagging; } inline bool systemDetectsMemoryTagFaultsTestOnly() { @@ -261,9 +262,7 @@ inline uptr loadTag(uptr Ptr) { #else -inline NORETURN bool systemSupportsMemoryTagging() { - UNREACHABLE("memory tagging not supported"); -} +inline bool systemSupportsMemoryTagging() { return false; } inline NORETURN bool systemDetectsMemoryTagFaultsTestOnly() { UNREACHABLE("memory tagging not supported"); diff --git a/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp b/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp index 09093e11452dd..d0d93316f212e 100644 --- a/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/memtag_test.cpp @@ -28,7 +28,6 @@ TEST(MemtagBasicDeathTest, Unsupported) { EXPECT_DEATH(untagPointer((uptr)0), "not supported"); EXPECT_DEATH(extractTag((uptr)0), "not supported"); - EXPECT_DEATH(systemSupportsMemoryTagging(), "not supported"); EXPECT_DEATH(systemDetectsMemoryTagFaultsTestOnly(), "not supported"); EXPECT_DEATH(enableSystemMemoryTaggingTestOnly(), "not supported"); diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp index 855a3e6e6109f..8741c8299b57c 100644 --- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp @@ -27,9 +27,7 @@ const scudo::uptr PageSize = scudo::getPageSizeCached(); template static scudo::Options getOptionsForConfig() { - if (!Config::getMaySupportMemoryTagging() || - !scudo::archSupportsMemoryTagging() || - !scudo::systemSupportsMemoryTagging()) + if (!scudo::systemSupportsMemoryTagging()) return {}; scudo::AtomicOptions AO; AO.set(scudo::OptionBit::UseMemoryTagging); diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp index c802ed22fbad0..84359251a02aa 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_cpp_test.cpp @@ -187,8 +187,7 @@ TEST_F(ScudoWrappersCppTest, ThreadedNew) { // TODO: Investigate why libc sometimes crashes with tag missmatch in // __pthread_clockjoin_ex. std::unique_ptr NoTags; - if (!SCUDO_ANDROID && scudo::archSupportsMemoryTagging() && - scudo::systemSupportsMemoryTagging()) + if (!SCUDO_ANDROID && scudo::systemSupportsMemoryTagging()) NoTags = std::make_unique(); Ready = false; From 8b1cc2d5f5c9f44d39e39382e8628f258c1bf219 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 10 Nov 2025 22:18:34 +0000 Subject: [PATCH 20/30] [VPlan] Update canNarrowLoad to check WidenMember0's op first (NFCI). This hardens the code to check based on WideMember0's operands. This ensures each call will go through the same check. Should be NFC currently but needed when generalizing in follow-up patches. --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f5bef08fafcdc..eab642678702a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -4147,13 +4147,13 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { /// is defined at \p Idx of a load interleave group. static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx) { - auto *DefR = OpV->getDefiningRecipe(); - if (!DefR) - return WideMember0->getOperand(OpIdx) == OpV; - if (auto *W = dyn_cast(DefR)) - return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV; - - if (auto *IR = dyn_cast(DefR)) + VPValue *Member0Op = WideMember0->getOperand(OpIdx); + VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe(); + if (!Member0OpR) + return Member0Op == OpV; + if (auto *W = dyn_cast(Member0OpR)) + return !W->getMask() && Member0Op == OpV; + if (auto *IR = dyn_cast(Member0OpR)) return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV; return false; } From 95db31e7f69e7008f0c570ed30d54d1e418e10f2 Mon Sep 17 00:00:00 2001 From: Rahman Lavaee Date: Mon, 10 Nov 2025 14:19:12 -0800 Subject: [PATCH 21/30] Treat specifying a function in the bbsection profile without any directive as noop. (#167359) --- .../Inputs/basic-block-sections.funcnames | 4 +- clang/test/CodeGen/basic-block-sections.c | 6 +- .../CodeGen/BasicBlockSectionsProfileReader.h | 16 ++--- llvm/lib/CodeGen/BasicBlockSections.cpp | 23 +++---- .../BasicBlockSectionsProfileReader.cpp | 16 ++--- .../CodeGen/X86/basic-block-sections-list.ll | 62 +++---------------- .../X86/basic-block-sections-source-drift.ll | 8 ++- 7 files changed, 45 insertions(+), 90 deletions(-) diff --git a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames index 329cea9a0adfb..2452ee345fe2f 100644 --- a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames +++ b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames @@ -1 +1,3 @@ -!world +v1 +f world +c 0 diff --git a/clang/test/CodeGen/basic-block-sections.c b/clang/test/CodeGen/basic-block-sections.c index a61b8dd4ac376..0c21a4cb1442c 100644 --- a/clang/test/CodeGen/basic-block-sections.c +++ b/clang/test/CodeGen/basic-block-sections.c @@ -30,8 +30,10 @@ int another(int a) { // // BB_WORLD: .section .text.world,"ax",@progbits{{$}} // BB_WORLD: world: -// BB_WORLD: .section .text.world,"ax",@progbits,unique -// BB_WORLD: world.__part.1: +// BB_ALL: .section .text.world,"ax",@progbits,unique +// BB_ALL: world.__part.1: +// BB_LIST: .section .text.split.world,"ax",@progbits +// BB_LIST: world.cold: // BB_ALL: .section .text.another,"ax",@progbits // BB_ALL: another.__part.1: // BB_LIST-NOT: .section .text.another,"ax",@progbits diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h index 7b1a5f5019589..ee1f28377f7e4 100644 --- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h +++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h @@ -68,17 +68,13 @@ class BasicBlockSectionsProfileReader { BasicBlockSectionsProfileReader() = default; - // Returns true if basic block sections profile exist for function \p - // FuncName. + // Returns true if function \p FuncName is hot based on the basic block + // section profile. bool isFunctionHot(StringRef FuncName) const; - // Returns a pair with first element representing whether basic block sections - // profile exist for the function \p FuncName, and the second element - // representing the basic block sections profile (cluster info) for this - // function. If the first element is true and the second element is empty, it - // means unique basic block sections are desired for all basic blocks of the - // function. - std::pair> + // Returns the cluster info for the function \p FuncName. Returns an empty + // vector if function has no cluster info. + SmallVector getClusterInfoForFunction(StringRef FuncName) const; // Returns the path clonings for the given function. @@ -190,7 +186,7 @@ class BasicBlockSectionsProfileReaderWrapperPass : public ImmutablePass { bool isFunctionHot(StringRef FuncName) const; - std::pair> + SmallVector getClusterInfoForFunction(StringRef FuncName) const; SmallVector> diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index e317e1c06741f..52e2909bec072 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -183,8 +183,7 @@ updateBranches(MachineFunction &MF, // clusters are ordered in increasing order of their IDs, with the "Exception" // and "Cold" succeeding all other clusters. // FuncClusterInfo represents the cluster information for basic blocks. It -// maps from BBID of basic blocks to their cluster information. If this is -// empty, it means unique sections for all basic blocks in the function. +// maps from BBID of basic blocks to their cluster information. static void assignSections(MachineFunction &MF, const DenseMap &FuncClusterInfo) { @@ -197,10 +196,8 @@ assignSections(MachineFunction &MF, for (auto &MBB : MF) { // With the 'all' option, every basic block is placed in a unique section. // With the 'list' option, every basic block is placed in a section - // associated with its cluster, unless we want individual unique sections - // for every basic block in this function (if FuncClusterInfo is empty). - if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All || - FuncClusterInfo.empty()) { + // associated with its cluster. + if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) { // If unique sections are desired for all basic blocks of the function, we // set every basic block's section ID equal to its original position in // the layout (which is equal to its number). This ensures that basic @@ -308,22 +305,22 @@ bool BasicBlockSections::handleBBSections(MachineFunction &MF) { if (BBSectionsType == BasicBlockSection::List && hasInstrProfHashMismatch(MF)) return false; - // Renumber blocks before sorting them. This is useful for accessing the - // original layout positions and finding the original fallthroughs. - MF.RenumberBlocks(); DenseMap FuncClusterInfo; if (BBSectionsType == BasicBlockSection::List) { - auto [HasProfile, ClusterInfo] = - getAnalysis() - .getClusterInfoForFunction(MF.getName()); - if (!HasProfile) + auto ClusterInfo = getAnalysis() + .getClusterInfoForFunction(MF.getName()); + if (ClusterInfo.empty()) return false; for (auto &BBClusterInfo : ClusterInfo) { FuncClusterInfo.try_emplace(BBClusterInfo.BBID, BBClusterInfo); } } + // Renumber blocks before sorting them. This is useful for accessing the + // original layout positions and finding the original fallthroughs. + MF.RenumberBlocks(); + MF.setBBSectionsType(BBSectionsType); assignSections(MF, FuncClusterInfo); diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index 485b44ae4c4aa..c234c0f1b0b34 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -58,22 +58,24 @@ BasicBlockSectionsProfileReader::parseUniqueBBID(StringRef S) const { } bool BasicBlockSectionsProfileReader::isFunctionHot(StringRef FuncName) const { - return getClusterInfoForFunction(FuncName).first; + return !getClusterInfoForFunction(FuncName).empty(); } -std::pair> +SmallVector BasicBlockSectionsProfileReader::getClusterInfoForFunction( StringRef FuncName) const { auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); - return R != ProgramPathAndClusterInfo.end() - ? std::pair(true, R->second.ClusterInfo) - : std::pair(false, SmallVector()); + return R != ProgramPathAndClusterInfo.end() ? R->second.ClusterInfo + : SmallVector(); } SmallVector> BasicBlockSectionsProfileReader::getClonePathsForFunction( StringRef FuncName) const { - return ProgramPathAndClusterInfo.lookup(getAliasName(FuncName)).ClonePaths; + auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); + return R != ProgramPathAndClusterInfo.end() + ? R->second.ClonePaths + : SmallVector>(); } uint64_t BasicBlockSectionsProfileReader::getEdgeCount( @@ -494,7 +496,7 @@ bool BasicBlockSectionsProfileReaderWrapperPass::isFunctionHot( return BBSPR.isFunctionHot(FuncName); } -std::pair> +SmallVector BasicBlockSectionsProfileReaderWrapperPass::getClusterInfoForFunction( StringRef FuncName) const { return BBSPR.getClusterInfoForFunction(FuncName); diff --git a/llvm/test/CodeGen/X86/basic-block-sections-list.ll b/llvm/test/CodeGen/X86/basic-block-sections-list.ll index 45ef452f4f5c1..d652a540f3e9c 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-list.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-list.ll @@ -1,17 +1,13 @@ -;; Check the basic block sections list option. -;; version 0 profile: -; RUN: echo '!_Z3foob' > %t1 +;; Check that specifying the function in the basic block sections profile +;; without any other directives is a noop. ;; -;; version 1 profile: -; RUN: echo 'v1' > %t2 -; RUN: echo 'f _Z3foob' >> %t2 +;; Specify the bb sections profile: +; RUN: echo 'v1' > %t +; RUN: echo 'f _Z3foob' >> %t ;; -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX -; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX +; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t > %bbsections +; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections > %orig +; RUN: diff -u %orig %bbsections define i32 @_Z3foob(i1 zeroext %0) nounwind { %2 = alloca i32, align 4 @@ -41,45 +37,3 @@ define i32 @_Z3foob(i1 zeroext %0) nounwind { declare i32 @_Z3barv() #1 declare i32 @_Z3bazv() #1 - -define i32 @_Z3zipb(i1 zeroext %0) nounwind { - %2 = alloca i32, align 4 - %3 = alloca i8, align 1 - %4 = zext i1 %0 to i8 - store i8 %4, ptr %3, align 1 - %5 = load i8, ptr %3, align 1 - %6 = trunc i8 %5 to i1 - %7 = zext i1 %6 to i32 - %8 = icmp sgt i32 %7, 0 - br i1 %8, label %9, label %11 - -9: ; preds = %1 - %10 = call i32 @_Z3barv() - store i32 %10, ptr %2, align 4 - br label %13 - -11: ; preds = %1 - %12 = call i32 @_Z3bazv() - store i32 %12, ptr %2, align 4 - br label %13 - -13: ; preds = %11, %9 - %14 = load i32, ptr %2, align 4 - ret i32 %14 -} - -; LINUX-SECTIONS-NO-GUIDED-PREFIX: .section .text._Z3foob,"ax",@progbits -; LINUX-SECTIONS: .section .text.hot._Z3foob,"ax",@progbits -; LINUX-SECTIONS: _Z3foob: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.1,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.1: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.2,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.2: -; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.3,"ax",@progbits -; LINUX-SECTIONS: _Z3foob.__part.3: - -; LINUX-SECTIONS-FUNCTION-SECTION: .section .text._Z3zipb,"ax",@progbits -; LINUX-SECTIONS-NO-FUNCTION-SECTION-NOT: .section .text{{.*}}._Z3zipb,"ax",@progbits -; LINUX-SECTIONS: _Z3zipb: -; LINUX-SECTIONS-NOT: .section .text{{.*}}._Z3zipb.__part.{{[0-9]+}},"ax",@progbits -; LINUX-SECTIONS-NOT: _Z3zipb.__part.{{[0-9]+}}: diff --git a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll index d481b147662dc..6e0db20ca0492 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll @@ -1,6 +1,8 @@ -; RUN: echo "!foo" > %t.order.txt -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt | FileCheck --check-prefix=SOURCE-DRIFT %s -; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s +; RUN: echo "v1" > %t +; RUN: echo "f foo" >> %t +; RUN: echo "c 0" >> %t +; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t | FileCheck --check-prefix=SOURCE-DRIFT %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s define dso_local i32 @foo(i1 zeroext %0, i1 zeroext %1) !annotation !1 { br i1 %0, label %5, label %3 From d5125b3089ddb4e704752656c011d13b4eab0a2c Mon Sep 17 00:00:00 2001 From: Zhen Wang <37195552+wangzpgi@users.noreply.github.com> Date: Mon, 10 Nov 2025 14:28:32 -0800 Subject: [PATCH 22/30] [flang][CUDA] Unify element size computation in CUF helpers (#167398) Refactor computeWidth from CUFOpConversion into a shared helper function computeElementByteSize in CUFCommon. --- .../flang/Optimizer/Builder/CUFCommon.h | 5 ++++ flang/lib/Optimizer/Builder/CUFCommon.cpp | 23 +++++++++++++++ .../Optimizer/Transforms/CUFOpConversion.cpp | 28 ++----------------- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h index 5c56dd6b695f8..6e2442745f9a0 100644 --- a/flang/include/flang/Optimizer/Builder/CUFCommon.h +++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h @@ -18,6 +18,7 @@ static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem"; namespace fir { class FirOpBuilder; +class KindMapping; } // namespace fir namespace cuf { @@ -34,6 +35,10 @@ bool isRegisteredDeviceAttr(std::optional attr); void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder); +int computeElementByteSize(mlir::Location loc, mlir::Type type, + fir::KindMapping &kindMap, + bool emitErrorOnFailure = true); + } // namespace cuf #endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_ diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp index cf7588f275d22..461deb8e4cb55 100644 --- a/flang/lib/Optimizer/Builder/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -9,6 +9,7 @@ #include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" +#include "flang/Optimizer/Dialect/Support/KindMapping.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" @@ -91,3 +92,25 @@ void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) { } } } + +int cuf::computeElementByteSize(mlir::Location loc, mlir::Type type, + fir::KindMapping &kindMap, + bool emitErrorOnFailure) { + auto eleTy = fir::unwrapSequenceType(type); + if (auto t{mlir::dyn_cast(eleTy)}) + return t.getWidth() / 8; + if (auto t{mlir::dyn_cast(eleTy)}) + return t.getWidth() / 8; + if (auto t{mlir::dyn_cast(eleTy)}) + return kindMap.getLogicalBitsize(t.getFKind()) / 8; + if (auto t{mlir::dyn_cast(eleTy)}) { + int elemSize = + mlir::cast(t.getElementType()).getWidth() / 8; + return 2 * elemSize; + } + if (auto t{mlir::dyn_cast(eleTy)}) + return kindMap.getCharacterBitsize(t.getFKind()) / 8; + if (emitErrorOnFailure) + mlir::emitError(loc, "unsupported type"); + return 0; +} diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 8d00272b09f42..5b1b0a2f6feab 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -263,28 +263,6 @@ static bool inDeviceContext(mlir::Operation *op) { return false; } -static int computeWidth(mlir::Location loc, mlir::Type type, - fir::KindMapping &kindMap) { - auto eleTy = fir::unwrapSequenceType(type); - if (auto t{mlir::dyn_cast(eleTy)}) - return t.getWidth() / 8; - if (auto t{mlir::dyn_cast(eleTy)}) - return t.getWidth() / 8; - if (eleTy.isInteger(1)) - return 1; - if (auto t{mlir::dyn_cast(eleTy)}) - return kindMap.getLogicalBitsize(t.getFKind()) / 8; - if (auto t{mlir::dyn_cast(eleTy)}) { - int elemSize = - mlir::cast(t.getElementType()).getWidth() / 8; - return 2 * elemSize; - } - if (auto t{mlir::dyn_cast_or_null(eleTy)}) - return kindMap.getCharacterBitsize(t.getFKind()) / 8; - mlir::emitError(loc, "unsupported type"); - return 0; -} - struct CUFAllocOpConversion : public mlir::OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -320,7 +298,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern { mlir::Value bytes; fir::KindMapping kindMap{fir::getKindMapping(mod)}; if (fir::isa_trivial(op.getInType())) { - int width = computeWidth(loc, op.getInType(), kindMap); + int width = cuf::computeElementByteSize(loc, op.getInType(), kindMap); bytes = builder.createIntegerConstant(loc, builder.getIndexType(), width); } else if (auto seqTy = mlir::dyn_cast_or_null( @@ -330,7 +308,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern { mlir::Type structTy = typeConverter->convertType(seqTy.getEleTy()); size = dl->getTypeSizeInBits(structTy) / 8; } else { - size = computeWidth(loc, seqTy.getEleTy(), kindMap); + size = cuf::computeElementByteSize(loc, seqTy.getEleTy(), kindMap); } mlir::Value width = builder.createIntegerConstant(loc, builder.getIndexType(), size); @@ -704,7 +682,7 @@ struct CUFDataTransferOpConversion typeConverter->convertType(fir::unwrapSequenceType(dstTy)); width = dl->getTypeSizeInBits(structTy) / 8; } else { - width = computeWidth(loc, dstTy, kindMap); + width = cuf::computeElementByteSize(loc, dstTy, kindMap); } mlir::Value widthValue = mlir::arith::ConstantOp::create( rewriter, loc, i64Ty, rewriter.getIntegerAttr(i64Ty, width)); From 7b12a08f5e0d5e615dee2b62f9a68a03e68b6c93 Mon Sep 17 00:00:00 2001 From: AZero13 Date: Mon, 10 Nov 2025 17:32:31 -0500 Subject: [PATCH 23/30] [AArch64] Allow peephole to optimize AND + signed compare with 0 (#153608) This should be the peephole's job. Because and sets V flag to 0, this is why signed comparisons with 0 are okay to replace with tst. Note this is only for AArch64, because ANDS on ARM leaves the V flag the same. Fixes: https://github.com/llvm/llvm-project/issues/154387 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 48 ++- .../AArch64/arm64-regress-opt-cmp-signed.mir | 55 +++ llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll | 332 ++++++++++++++++++ 3 files changed, 434 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 55a04cca4c394..5e95807a68199 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1780,6 +1780,16 @@ static unsigned sForm(MachineInstr &Instr) { case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::ANDSXrs: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::BICSWrs: + case AArch64::BICSXrs: return Instr.getOpcode(); case AArch64::ADDWrr: @@ -1810,6 +1820,22 @@ static unsigned sForm(MachineInstr &Instr) { return AArch64::ANDSWri; case AArch64::ANDXri: return AArch64::ANDSXri; + case AArch64::ANDWrr: + return AArch64::ANDSWrr; + case AArch64::ANDWrs: + return AArch64::ANDSWrs; + case AArch64::ANDXrr: + return AArch64::ANDSXrr; + case AArch64::ANDXrs: + return AArch64::ANDSXrs; + case AArch64::BICWrr: + return AArch64::BICSWrr; + case AArch64::BICXrr: + return AArch64::BICSXrr; + case AArch64::BICWrs: + return AArch64::BICSWrs; + case AArch64::BICXrs: + return AArch64::BICSXrs; } } @@ -1947,6 +1973,25 @@ static bool isSUBSRegImm(unsigned Opcode) { return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; } +static bool isANDOpcode(MachineInstr &MI) { + unsigned Opc = sForm(MI); + switch (Opc) { + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::ANDSXrs: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + return true; + default: + return false; + } +} + /// Check if CmpInstr can be substituted by MI. /// /// CmpInstr can be substituted: @@ -1984,7 +2029,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, // 1) MI and CmpInstr set N and V to the same value. // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when // signed overflow occurs, so CmpInstr could still be simplified away. - if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap)) + // Note that Ands and Bics instructions always clear the V flag. + if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI)) return false; AccessKind AccessToCheck = AK_Write; diff --git a/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir new file mode 100644 index 0000000000000..8c31e7c2d1cec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir @@ -0,0 +1,55 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s | FileCheck %s +--- | + define i32 @test01() nounwind { + entry: + %0 = select i1 true, i32 1, i32 0 + %1 = and i32 %0, 65535 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %if.then, label %if.end + + if.then: ; preds = %entry + ret i32 1 + + if.end: ; preds = %entry + ret i32 0 + } +... +--- +name: test01 +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr32common } +body: | + ; CHECK-LABEL: name: test01 + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 1 + ; CHECK-NEXT: [[ANDSWri:%[0-9]+]]:gpr32common = ANDSWri killed [[ANDSWri]], 15, implicit-def $nzcv + ; CHECK-NEXT: Bcc 12, %bb.2, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.if.then: + ; CHECK-NEXT: $w0 = MOVi32imm 1 + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.end: + ; CHECK-NEXT: $w0 = MOVi32imm 0 + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + bb.0.entry: + successors: %bb.2.if.end, %bb.1.if.then + + %0 = MOVi32imm 1 + %1 = ANDWri killed %1, 15 + $wzr = SUBSWri killed %1, 0, 0, implicit-def $nzcv + Bcc 12, %bb.2.if.end, implicit $nzcv + + bb.1.if.then: + $w0 = MOVi32imm 1 + RET_ReallyLR implicit $w0 + + bb.2.if.end: + $w0 = MOVi32imm 0 + RET_ReallyLR implicit $w0 + +... diff --git a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll index 33c5ba7987974..8297fa2d4e3f9 100644 --- a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll +++ b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll @@ -161,6 +161,338 @@ define i1 @lt64_u16_and_23(i64 %0) { ret i1 %3 } +define i1 @test_disjoint(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp eq i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp sgt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp slt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint4(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: and w8, w9, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp sle i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_4(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint_inverse_4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bic w8, w9, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp sle i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp eq i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint2_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp sgt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint3_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp slt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp eq i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint2_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp sgt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint3_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp slt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint4_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint4_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: and x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp sle i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_4_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_inverse_4_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bic x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp sle i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp eq i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint2_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp sgt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint3_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp slt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + ; negative test define i1 @lt3_u8(i8 %0) { ; CHECK-LABEL: lt3_u8: From bf3b704c60cc521b79ec54bd57fcf72368178a52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Susan=20Tan=20=28=E3=82=B9-=E3=82=B6=E3=83=B3=E3=80=80?= =?UTF-8?q?=E3=82=BF=E3=83=B3=29?= Date: Mon, 10 Nov 2025 17:33:43 -0500 Subject: [PATCH 24/30] [flang][NFC] Characterize allocation based on MemAlloc effect instead of pattern matching (#166806) Flang alias analysis used to find allocation site by pattern matching allocation ops in mainly FIR dialect. This MR extends the characterization to instead characterize based on whether the result of an op has MemAlloc effect. --- .../include/flang/Optimizer/Dialect/FIROps.td | 12 +- .../lib/Optimizer/Analysis/AliasAnalysis.cpp | 57 ++++++-- flang/test/Driver/tco-emit-final-mlir.fir | 2 + flang/test/Fir/alloc.fir | 9 ++ .../test/Fir/omp-reduction-embox-codegen.fir | 2 + flang/test/Fir/pdt.fir | 6 +- flang/test/HLFIR/inline-hlfir-copy-in.fir | 2 +- flang/test/Lower/Intrinsics/c_f_pointer.f90 | 1 - flang/test/Lower/Intrinsics/system_clock.f90 | 2 - flang/test/Lower/allocatables.f90 | 6 +- .../test/Lower/character-local-variables.f90 | 11 ++ flang/test/Lower/derived-types.f90 | 2 + flang/test/Lower/do_loop_unstructured.f90 | 1 + flang/test/Lower/forall/array-pointer.f90 | 1 - .../test/Lower/forall/forall-allocatable.f90 | 17 ++- flang/test/Lower/loops.f90 | 1 - flang/test/Lower/polymorphic.f90 | 4 - flang/test/Lower/statement-function.f90 | 1 - flang/test/Transforms/stack-arrays.fir | 131 +++++++++++------- 19 files changed, 172 insertions(+), 96 deletions(-) diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index bae52d63fda45..289c79bd9b831 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -80,8 +80,7 @@ def AnyRefOfConstantSizeAggregateType : TypeConstraint< // Memory SSA operations //===----------------------------------------------------------------------===// -def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, - MemoryEffects<[MemAlloc]>]> { +def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments]> { let summary = "allocate storage for a temporary on the stack given a type"; let description = [{ This primitive operation is used to allocate an object on the stack. A @@ -162,7 +161,9 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, Variadic:$shape ); - let results = (outs fir_ReferenceType); + let results = + (outs Res]>:$res); let hasCustomAssemblyFormat = 1; let hasVerifier = 1; @@ -212,8 +213,7 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, }]; } -def fir_AllocMemOp : fir_Op<"allocmem", - [MemoryEffects<[MemAlloc]>, AttrSizedOperandSegments]> { +def fir_AllocMemOp : fir_Op<"allocmem", [AttrSizedOperandSegments]> { let summary = "allocate storage on the heap for an object of a given type"; let description = [{ @@ -235,7 +235,7 @@ def fir_AllocMemOp : fir_Op<"allocmem", Variadic:$typeparams, Variadic:$shape ); - let results = (outs fir_HeapType); + let results = (outs Res]>:$res); let hasCustomAssemblyFormat = 1; let hasVerifier = 1; diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 73ddd1ff80126..ef9894232b409 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -27,6 +27,26 @@ using namespace mlir; #define DEBUG_TYPE "fir-alias-analysis" +// Inspect for value-scoped Allocate effects and determine whether +// 'candidate' is a new allocation. Returns SourceKind::Allocate if a +// MemAlloc effect is attached +static fir::AliasAnalysis::SourceKind +classifyAllocateFromEffects(mlir::Operation *op, mlir::Value candidate) { + if (!op) + return fir::AliasAnalysis::SourceKind::Unknown; + auto interface = llvm::dyn_cast(op); + if (!interface) + return fir::AliasAnalysis::SourceKind::Unknown; + llvm::SmallVector effects; + interface.getEffects(effects); + for (mlir::MemoryEffects::EffectInstance &e : effects) { + if (mlir::isa(e.getEffect()) && + e.getValue() && e.getValue() == candidate) + return fir::AliasAnalysis::SourceKind::Allocate; + } + return fir::AliasAnalysis::SourceKind::Unknown; +} + //===----------------------------------------------------------------------===// // AliasAnalysis: alias //===----------------------------------------------------------------------===// @@ -535,6 +555,11 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, mlir::Operation *instantiationPoint{nullptr}; while (defOp && !breakFromLoop) { ty = defOp->getResultTypes()[0]; + // Value-scoped allocation detection via effects. + if (classifyAllocateFromEffects(defOp, v) == SourceKind::Allocate) { + type = SourceKind::Allocate; + break; + } llvm::TypeSwitch(defOp) .Case([&](auto op) { v = op.getVar(); @@ -554,11 +579,6 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, defOp = v.getDefiningOp(); } }) - .Case([&](auto op) { - // Unique memory allocation. - type = SourceKind::Allocate; - breakFromLoop = true; - }) .Case([&](auto op) { // Skip ConvertOp's and track further through the operand. v = op->getOperand(0); @@ -628,16 +648,23 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, type = SourceKind::Global; } else { auto def = llvm::cast(boxSrc.origin.u); - // TODO: Add support to fir.allocmem - if (auto allocOp = def.template getDefiningOp()) { - v = def; - defOp = v.getDefiningOp(); - type = SourceKind::Allocate; - } else if (isDummyArgument(def)) { - defOp = nullptr; - v = def; - } else { - type = SourceKind::Indirect; + bool classified = false; + if (auto defDefOp = def.getDefiningOp()) { + if (classifyAllocateFromEffects(defDefOp, def) == + SourceKind::Allocate) { + v = def; + defOp = defDefOp; + type = SourceKind::Allocate; + classified = true; + } + } + if (!classified) { + if (isDummyArgument(def)) { + defOp = nullptr; + v = def; + } else { + type = SourceKind::Indirect; + } } } breakFromLoop = true; diff --git a/flang/test/Driver/tco-emit-final-mlir.fir b/flang/test/Driver/tco-emit-final-mlir.fir index 75f8f153127af..7e934c921e773 100644 --- a/flang/test/Driver/tco-emit-final-mlir.fir +++ b/flang/test/Driver/tco-emit-final-mlir.fir @@ -15,5 +15,7 @@ func.func @_QPfoo() { %1 = fir.alloca i32 + %0 = arith.constant 0 : i32 + fir.store %0 to %1 : !fir.ref return } diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir index 8da8b828c18b9..613c8e274baad 100644 --- a/flang/test/Fir/alloc.fir +++ b/flang/test/Fir/alloc.fir @@ -372,8 +372,17 @@ func.func @alloca_unlimited_polymorphic_box() { %1 = fir.alloca !fir.class> %2 = fir.alloca !fir.box %3 = fir.alloca !fir.box> + // Add real uses so allocas are not trivially dead. + fir.call @__use_class_none(%0) : (!fir.ref>) -> () + fir.call @__use_class_array(%1) : (!fir.ref>>) -> () + fir.call @__use_box_none(%2) : (!fir.ref>) -> () + fir.call @__use_box_array(%3) : (!fir.ref>>) -> () return } +func.func private @__use_class_none(!fir.ref>) -> () +func.func private @__use_class_array(!fir.ref>>) -> () +func.func private @__use_box_none(!fir.ref>) -> () +func.func private @__use_box_array(!fir.ref>>) -> () // Note: allocmem of fir.box are not possible (fir::HeapType::verify does not // accept box types), so there is no equivalent of // alloca_unlimited_polymorphic_box for allocmem. diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir index 1645e1a407ad4..47fffb35a7d92 100644 --- a/flang/test/Fir/omp-reduction-embox-codegen.fir +++ b/flang/test/Fir/omp-reduction-embox-codegen.fir @@ -28,9 +28,11 @@ func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { omp.parallel reduction(byref @test_reduction %4 -> %arg0 : !fir.ref>) { omp.terminator } + func.call @__use_box_i32(%4) : (!fir.ref>) -> () return } +func.func private @__use_box_i32(!fir.ref>) -> () // basically we are testing that there isn't a crash // CHECK-LABEL: define void @_QQmain // CHECK-NEXT: alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 diff --git a/flang/test/Fir/pdt.fir b/flang/test/Fir/pdt.fir index a200cd7e7cc03..04f48e745d033 100644 --- a/flang/test/Fir/pdt.fir +++ b/flang/test/Fir/pdt.fir @@ -95,14 +95,14 @@ func.func @_QTt1P.f2.offset(%0 : i32, %1 : i32) -> i32 { // end program p func.func private @bar(!fir.ref>) +func.func private @__use_t1(!fir.ref,f2:!fir.char<1,?>}>>) -> () // CHECK-LABEL: define void @_QPfoo(i32 %0, i32 %1) func.func @_QPfoo(%arg0 : i32, %arg1 : i32) { // CHECK: %[[size:.*]] = call i64 @_QTt1P.mem.size(i32 %0, i32 %1) // CHECK: %[[alloc:.*]] = alloca i8, i64 %[[size]] %0 = fir.alloca !fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>(%arg0, %arg1 : i32, i32) - //%2 = fir.coordinate_of %0, f2 : (!fir.ref>) -> !fir.ref> - %2 = fir.zero_bits !fir.ref> - fir.call @bar(%2) : (!fir.ref>) -> () + // Keep alloca live without creating an unsupported coordinate_of on dynamic-sized field. + func.call @__use_t1(%0) : (!fir.ref,f2:!fir.char<1,?>}>>) -> () return } diff --git a/flang/test/HLFIR/inline-hlfir-copy-in.fir b/flang/test/HLFIR/inline-hlfir-copy-in.fir index f3c4b38962a0c..f1da1da9f9a5c 100644 --- a/flang/test/HLFIR/inline-hlfir-copy-in.fir +++ b/flang/test/HLFIR/inline-hlfir-copy-in.fir @@ -75,7 +75,7 @@ func.func private @_test_inline_copy_in(%arg0: !fir.box> { // CHECK: %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box>) -> !fir.ref> // CHECK: %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref, !fir.ref, i1) // CHECK: fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath : (!fir.ref>, !fir.ref) -> () -// CHECK: hlfir.copy_out %16, %15#1 : (!fir.ref>>, i1) -> () +// CHECK: hlfir.copy_out %{{.*}}, %[[VAL_21:.*]]#1 : (!fir.ref>>, i1) -> () // CHECK: hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref, i1 // CHECK: return // CHECK: } diff --git a/flang/test/Lower/Intrinsics/c_f_pointer.f90 b/flang/test/Lower/Intrinsics/c_f_pointer.f90 index c1f1d7972d4b1..f54fda42cf51b 100644 --- a/flang/test/Lower/Intrinsics/c_f_pointer.f90 +++ b/flang/test/Lower/Intrinsics/c_f_pointer.f90 @@ -153,7 +153,6 @@ subroutine dynamic_shape_lower(cptr, fpr, shape, lower) ! CHECK: %[[VAL_2:.*]] = fir.shape %[[C_0]], %[[C_0]] : (index, index) -> !fir.shape<2> ! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_1:.*]](%[[VAL_2]]) : (!fir.ptr>, !fir.shape<2>) -> !fir.box>> ! CHECK: fir.store %[[VAL_3]] to %[[VAL_0:.*]] : !fir.ref>>> -! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFdynamic_shape_lowerEn"} ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0:.*]], __address : (!fir.ref>) -> !fir.ref ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref ! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> !fir.ptr> diff --git a/flang/test/Lower/Intrinsics/system_clock.f90 b/flang/test/Lower/Intrinsics/system_clock.f90 index 9eae3a58884fa..f6fae1113b315 100644 --- a/flang/test/Lower/Intrinsics/system_clock.f90 +++ b/flang/test/Lower/Intrinsics/system_clock.f90 @@ -32,11 +32,9 @@ subroutine system_clock_test() ! CHECK-LABEL: @_QPss subroutine ss(count) - ! CHECK: %[[V_0:[0-9]+]] = fir.alloca !fir.box> {bindc_name = "count_max", uniq_name = "_QFssEcount_max"} ! CHECK: %[[V_1:[0-9]+]] = fir.alloca !fir.heap {uniq_name = "_QFssEcount_max.addr"} ! CHECK: %[[V_2:[0-9]+]] = fir.zero_bits !fir.heap ! CHECK: fir.store %[[V_2]] to %[[V_1]] : !fir.ref> - ! CHECK: %[[V_3:[0-9]+]] = fir.alloca !fir.box> {bindc_name = "count_rate", uniq_name = "_QFssEcount_rate"} ! CHECK: %[[V_4:[0-9]+]] = fir.alloca !fir.ptr {uniq_name = "_QFssEcount_rate.addr"} ! CHECK: %[[V_5:[0-9]+]] = fir.zero_bits !fir.ptr ! CHECK: fir.store %[[V_5]] to %[[V_4]] : !fir.ref> diff --git a/flang/test/Lower/allocatables.f90 b/flang/test/Lower/allocatables.f90 index e62f92fa0c1c7..60b7de3301c48 100644 --- a/flang/test/Lower/allocatables.f90 +++ b/flang/test/Lower/allocatables.f90 @@ -56,7 +56,7 @@ subroutine foodim1() ! CHECK-DAG: fir.load %[[xAddrVar]] : !fir.ref>> deallocate(x) - ! CHECK: %[[xAddr1:.*]] = fir.load %1 : !fir.ref>> + ! CHECK: %[[xAddr1:.*]] = fir.load %{{.*}} : !fir.ref>> ! CHECK: fir.freemem %[[xAddr1]] ! CHECK: %[[nullAddr1:.*]] = fir.zero_bits !fir.heap> ! CHECK: fir.store %[[nullAddr1]] to %[[xAddrVar]] : !fir.ref>> @@ -67,10 +67,6 @@ subroutine foodim2() ! Test lowering of local allocatable specification real, allocatable :: x(:, :) ! CHECK-DAG: fir.alloca !fir.heap> {{{.*}}uniq_name = "_QFfoodim2Ex.addr"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb0"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext0"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb1"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext1"} end subroutine ! test lowering of character allocatables. Focus is placed on the length handling diff --git a/flang/test/Lower/character-local-variables.f90 b/flang/test/Lower/character-local-variables.f90 index d5b959eca1ff6..6325229993a25 100644 --- a/flang/test/Lower/character-local-variables.f90 +++ b/flang/test/Lower/character-local-variables.f90 @@ -8,6 +8,7 @@ subroutine scalar_cst_len() character(10) :: c ! CHECK: fir.alloca !fir.char<1,10> {{{.*}}uniq_name = "_QFscalar_cst_lenEc"} + print *, c end subroutine ! CHECK-LABEL: func @_QPscalar_dyn_len @@ -19,12 +20,14 @@ subroutine scalar_dyn_len(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32 ! CHECK: fir.alloca !fir.char<1,?>(%[[l]] : i32) {{{.*}}uniq_name = "_QFscalar_dyn_lenEc"} + print *, c end subroutine ! CHECK-LABEL: func @_QPcst_array_cst_len subroutine cst_array_cst_len() character(10) :: c(20) ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPcst_array_dyn_len @@ -36,6 +39,7 @@ subroutine cst_array_dyn_len(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32 ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i32) {{{.*}}uniq_name = "_QFcst_array_dyn_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPdyn_array_cst_len @@ -48,6 +52,7 @@ subroutine dyn_array_cst_len(n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_lenEc"} + print *, c(1) end subroutine ! CHECK: func @_QPdyn_array_dyn_len @@ -63,12 +68,14 @@ subroutine dyn_array_dyn_len(l, n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array>(%[[l]] : i32), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPcst_array_cst_len_lb subroutine cst_array_cst_len_lb() character(10) :: c(11:30) ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPcst_array_dyn_len_lb @@ -80,6 +87,7 @@ subroutine cst_array_dyn_len_lb(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i64 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i64 ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i64) {{{.*}}uniq_name = "_QFcst_array_dyn_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPdyn_array_cst_len_lb @@ -94,6 +102,7 @@ subroutine dyn_array_cst_len_lb(n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPdyn_array_dyn_len_lb @@ -111,6 +120,7 @@ subroutine dyn_array_dyn_len_lb(l, n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array>(%[[l]] : i64), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_len_lbEc"} + print *, c(11) end subroutine ! Test that the length of assumed length parameter is correctly deduced in lowering. @@ -129,4 +139,5 @@ subroutine assumed_length_param(n) subroutine scalar_cst_neg_len() character(-1) :: c ! CHECK: fir.alloca !fir.char<1,0> {{{.*}}uniq_name = "_QFscalar_cst_neg_lenEc"} + print *, c end subroutine diff --git a/flang/test/Lower/derived-types.f90 b/flang/test/Lower/derived-types.f90 index 4d36a7632b070..7e36ec0cfe93f 100644 --- a/flang/test/Lower/derived-types.f90 +++ b/flang/test/Lower/derived-types.f90 @@ -35,6 +35,8 @@ subroutine local_derived() ! CHECK-DAG: fir.alloca !fir.type<_QMdTr{x:f32}> type(r) :: some_r type(c2) :: some_c2 + print *, some_c2%ch_array(1,1) + print *, some_r%x end subroutine ! CHECK-LABEL: func @_QMdPsaved_derived( diff --git a/flang/test/Lower/do_loop_unstructured.f90 b/flang/test/Lower/do_loop_unstructured.f90 index 3b03850b43bb2..9c7d874a1aac8 100644 --- a/flang/test/Lower/do_loop_unstructured.f90 +++ b/flang/test/Lower/do_loop_unstructured.f90 @@ -235,6 +235,7 @@ subroutine nested_structured_in_unstructured() subroutine unstructured_do_concurrent logical :: success do concurrent (i=1:10) local(success) + success = .false. error stop "fail" enddo end diff --git a/flang/test/Lower/forall/array-pointer.f90 b/flang/test/Lower/forall/array-pointer.f90 index fd3efed736c39..6b8c5648af29e 100644 --- a/flang/test/Lower/forall/array-pointer.f90 +++ b/flang/test/Lower/forall/array-pointer.f90 @@ -318,7 +318,6 @@ end subroutine s2_3 ! CHECK-LABEL: func @_QPs2_3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box>}>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} -! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box>> {bindc_name = "y", fir.target, uniq_name = "_QFs2_3Ey"} ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.heap> {uniq_name = "_QFs2_3Ey.addr"} ! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.lb0"} ! CHECK: %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.ext0"} diff --git a/flang/test/Lower/forall/forall-allocatable.f90 b/flang/test/Lower/forall/forall-allocatable.f90 index 96cd37ea3ed8a..8e54d282aea4b 100644 --- a/flang/test/Lower/forall/forall-allocatable.f90 +++ b/flang/test/Lower/forall/forall-allocatable.f90 @@ -13,20 +13,19 @@ end subroutine forall_with_allocatable ! CHECK-LABEL: func @_QPforall_with_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box>{{.*}}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} -! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box>> {bindc_name = "arr", uniq_name = "_QFforall_with_allocatableEarr"} -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.heap> {uniq_name = "_QFforall_with_allocatableEarr.addr"} -! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"} -! CHECK: %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"} -! CHECK: %[[VAL_6:.*]] = fir.zero_bits !fir.heap> -! CHECK: fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref>> +! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.heap> {uniq_name = "_QFforall_with_allocatableEarr.addr"} +! CHECK: %[[VAL_3:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"} +! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"} +! CHECK: %[[VAL_5:.*]] = fir.zero_bits !fir.heap> +! CHECK: fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref>> ! CHECK: %[[VAL_7:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index ! CHECK: %[[VAL_9:.*]] = arith.constant 15 : i32 ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : index -! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_4]] : !fir.ref -! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_5]] : !fir.ref -! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_3]] : !fir.ref>> +! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_3]] : !fir.ref +! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_4]] : !fir.ref +! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_2]] : !fir.ref>> ! CHECK: %[[VAL_15:.*]] = fir.shape_shift %[[VAL_12]], %[[VAL_13]] : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_16:.*]] = fir.array_load %[[VAL_14]](%[[VAL_15]]) : (!fir.heap>, !fir.shapeshift<1>) -> !fir.array ! CHECK: %[[VAL_17:.*]] = fir.array_load %[[VAL_0]] : (!fir.box>) -> !fir.array diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90 index 2fea84b03891a..5ee6562733dae 100644 --- a/flang/test/Lower/loops.f90 +++ b/flang/test/Lower/loops.f90 @@ -90,7 +90,6 @@ subroutine lis(n) ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}}, %{{.*}} {bindc_name = "a", fir.target, uniq_name = "_QFlisEa"} ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}} {bindc_name = "r", uniq_name = "_QFlisEr"} ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}} {bindc_name = "s", uniq_name = "_QFlisEs"} - ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}} {bindc_name = "t", uniq_name = "_QFlisEt"} integer, target :: a(n,n,n) ! operand via p integer :: r(n,n) ! result, unspecified locality integer :: s(n,n) ! shared locality diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index f586380e653a0..bc4eed54282df 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -287,7 +287,6 @@ subroutine pointer_assign_parent(p) ! First test is here to have a reference with non polymorphic on both sides. ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_parent( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref> {fir.bindc_name = "p", fir.target}) { -! CHECK: %[[TP:.*]] = fir.alloca !fir.box>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp"} ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr> {uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp.addr"} ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr> ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref>> @@ -302,7 +301,6 @@ subroutine pointer_assign_non_poly(p) ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_non_poly( ! CHECK-SAME: %arg0: !fir.class> {fir.bindc_name = "p", fir.target}) { -! CHECK: %[[TP:.*]] = fir.alloca !fir.box>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp"} ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr> {uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp.addr"} ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr> ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref>> @@ -1103,11 +1101,9 @@ subroutine class_with_entry(a) ! CHECK-LABEL: func.func @_QMpolymorphic_testPclass_with_entry( ! CHECK-SAME: %[[A:.*]]: !fir.class> {fir.bindc_name = "a"}) { -! CHECK: %[[B:.*]] = fir.alloca !fir.class> {bindc_name = "b", uniq_name = "_QMpolymorphic_testFclass_with_entryEb"} ! CHECK-LABEL: func.func @_QMpolymorphic_testPd( ! CHECK-SAME: %[[B:.*]]: !fir.class> {fir.bindc_name = "b"}) { -! CHECK: %[[A:.*]] = fir.alloca !fir.class> {bindc_name = "a", uniq_name = "_QMpolymorphic_testFclass_with_entryEa"} subroutine class_array_with_entry(a) class(p1) :: a(:), b(:) diff --git a/flang/test/Lower/statement-function.f90 b/flang/test/Lower/statement-function.f90 index cfec06c35baa8..fe07649e669af 100644 --- a/flang/test/Lower/statement-function.f90 +++ b/flang/test/Lower/statement-function.f90 @@ -129,7 +129,6 @@ integer function test_stmt_character_with_different_length_2(c, n) character(n) :: argc character(*) :: c ! CHECK: %[[unboxed:.*]]:2 = fir.unboxchar %[[arg0]] : - ! CHECK: fir.load %[[arg1]] : !fir.ref ! CHECK: %[[n:.*]] = fir.load %[[arg1]] : !fir.ref ! CHECK: %[[n_is_positive:.*]] = arith.cmpi sgt, %[[n]], %c0{{.*}} : i32 ! CHECK: %[[len:.*]] = arith.select %[[n_is_positive]], %[[n]], %c0{{.*}} : i32 diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir index 4a417ed981ab1..25fc73153003a 100644 --- a/flang/test/Transforms/stack-arrays.fir +++ b/flang/test/Transforms/stack-arrays.fir @@ -3,13 +3,17 @@ // Simplest transformation func.func @simple() { %0 = fir.allocmem !fir.array<42xi32> + %c0_s = arith.constant 0 : index + %c0_i32_s = arith.constant 0 : i32 + %ref_s = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt_s = fir.coordinate_of %ref_s, %c0_s : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_s to %elt_s : !fir.ref fir.freemem %0 : !fir.heap> return } -// CHECK: func.func @simple() { -// CHECK-NEXT: fir.alloca !fir.array<42xi32> -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK: func.func @simple() +// CHECK: fir.alloca !fir.array<42xi32> +// CHECK: return // Check fir.must_be_heap allocations are not moved func.func @must_be_heap() { @@ -17,7 +21,7 @@ func.func @must_be_heap() { fir.freemem %0 : !fir.heap> return } -// CHECK: func.func @must_be_heap() { +// CHECK-LABEL: func.func @must_be_heap() // CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true} // CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap> // CHECK-NEXT: return @@ -36,7 +40,7 @@ func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) { } return } -// CHECK: func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) { +// CHECK-LABEL: func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) // CHECK-NEXT: %[[C42:.*]] = arith.constant 42 : index // CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array, %[[C42]] {uniq_name = "_QFdfa1Earr.alloc"} // CHECK-NEXT: %[[LOGICAL:.*]] = fir.load %arg0 : !fir.ref> @@ -57,7 +61,7 @@ func.func @dfa2(%arg0: i1) { } return } -// CHECK: func.func @dfa2(%arg0: i1) { +// CHECK-LABEL: func.func @dfa2(%arg0: i1) // CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<1xi8> // CHECK-NEXT: scf.if %arg0 { // CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap> @@ -74,15 +78,16 @@ func.func @dfa3(%arg0: i1) { } else { fir.freemem %a : !fir.heap> } + %c0_d3 = arith.constant 0 : index + %c0_i8_d3 = arith.constant 0 : i8 + %ref_d3 = fir.convert %a : (!fir.heap>) -> !fir.ref> + %elt_d3 = fir.coordinate_of %ref_d3, %c0_d3 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i8_d3 to %elt_d3 : !fir.ref return } -// CHECK: func.func @dfa3(%arg0: i1) { -// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> -// CHECK-NEXT: fir.if %arg0 { -// CHECK-NEXT: } else { -// CHECK-NEXT: } -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK: func.func @dfa3(%arg0: i1) +// CHECK: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> +// CHECK: return func.func private @dfa3a_foo(!fir.ref>) -> () func.func private @dfa3a_bar(!fir.ref>) -> () @@ -101,7 +106,7 @@ func.func @dfa3a(%arg0: i1) { } return } -// CHECK: func.func @dfa3a(%arg0: i1) { +// CHECK-LABEL: func.func @dfa3a(%arg0: i1) // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> // CHECK-NEXT: %[[HEAP:.*]] = fir.convert %[[MEM]] : (!fir.ref>) -> !fir.heap> // CHECK-NEXT: fir.if %arg0 { @@ -123,13 +128,18 @@ func.func @placement1() { // operand is now available %4 = fir.allocmem !fir.array, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref1 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt1 = fir.coordinate_of %ref1, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32 to %elt1 : !fir.ref fir.freemem %4 : !fir.heap> return } -// CHECK: func.func @placement1() { +// CHECK-LABEL: func.func @placement1() // CHECK-NEXT: %[[ARG:.*]] = arith.constant 3 : index // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[ARG]] -// CHECK-NEXT: return +// CHECK: return // CHECK-NEXT: } // check that if there are no operands, then the alloca is placed early @@ -140,16 +150,21 @@ func.func @placement2() { %3 = arith.addi %1, %2 : index %4 = fir.allocmem !fir.array<42xi32> // ... + %c0_p2 = arith.constant 0 : index + %c0_i32_p2 = arith.constant 0 : i32 + %ref_p2 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt_p2 = fir.coordinate_of %ref_p2, %c0_p2 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_p2 to %elt_p2 : !fir.ref fir.freemem %4 : !fir.heap> return } -// CHECK: func.func @placement2() { -// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> -// CHECK-NEXT: %[[ONE:.*]] = arith.constant 1 : index -// CHECK-NEXT: %[[TWO:.*]] = arith.constant 2 : index -// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK-LABEL: func.func @placement2() +// CHECK: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> +// CHECK: %[[ONE:.*]] = arith.constant 1 : index +// CHECK: %[[TWO:.*]] = arith.constant 2 : index +// CHECK: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index +// CHECK: return +// CHECK: } // check that stack allocations which must be placed in loops use stacksave func.func @placement3() { @@ -162,12 +177,17 @@ func.func @placement3() { // operand is now available %4 = fir.allocmem !fir.array, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref2 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt2 = fir.coordinate_of %ref2, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32 to %elt2 : !fir.ref fir.freemem %4 : !fir.heap> fir.result %3, %c1_i32 : index, i32 } return } -// CHECK: func.func @placement3() { +// CHECK-LABEL: func.func @placement3() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -176,7 +196,7 @@ func.func @placement3() { // CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index // CHECK-NEXT: %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[SUM]] -// CHECK-NEXT: llvm.intr.stackrestore %[[SP]] : !llvm.ptr +// CHECK: llvm.intr.stackrestore %[[SP]] : !llvm.ptr // CHECK-NEXT: fir.result // CHECK-NEXT: } // CHECK-NEXT: return @@ -194,12 +214,17 @@ func.func @placement4(%arg0 : i1) { // operand is now available %4 = fir.allocmem !fir.array, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref3 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt3 = fir.coordinate_of %ref3, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32 to %elt3 : !fir.ref fir.freemem %4 : !fir.heap> cf.cond_br %arg0, ^bb1, ^bb2 ^bb2: return } -// CHECK: func.func @placement4(%arg0: i1) { +// CHECK-LABEL: func.func @placement4(%arg0: i1) // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index @@ -208,7 +233,7 @@ func.func @placement4(%arg0 : i1) { // CHECK-NEXT: %[[C3:.*]] = arith.constant 3 : index // CHECK-NEXT: %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[C3]] -// CHECK-NEXT: llvm.intr.stackrestore %[[SP]] : !llvm.ptr +// CHECK: llvm.intr.stackrestore %[[SP]] : !llvm.ptr // CHECK-NEXT: cf.cond_br %arg0, ^bb1, ^bb2 // CHECK-NEXT: ^bb2: // CHECK-NEXT: return @@ -230,7 +255,7 @@ func.func @placement5() { } return } -// CHECK: func.func @placement5() { +// CHECK-LABEL: func.func @placement5() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -268,7 +293,7 @@ func.func @placement6(%arg0: i1) { fir.freemem %4 : !fir.heap> cf.br ^bb1 } -// CHECK: func.func @placement6(%arg0: i1) { +// CHECK-LABEL: func.func @placement6(%arg0: i1) // CHECK-NEXT: %[[c1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[c1_i32:.*]] = fir.convert %[[c1]] : (index) -> i32 // CHECK-NEXT: %[[c2:.*]] = arith.constant 2 : index @@ -289,6 +314,11 @@ func.func @placement6(%arg0: i1) { // Check multiple returns, where the memory is always freed func.func @returns(%arg0: i1) { %0 = fir.allocmem !fir.array<42xi32> + %c0_ret = arith.constant 0 : index + %c0_i32_ret = arith.constant 0 : i32 + %ref_ret = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt_ret = fir.coordinate_of %ref_ret, %c0_ret : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_ret to %elt_ret : !fir.ref cf.cond_br %arg0, ^bb1, ^bb2 ^bb1: fir.freemem %0 : !fir.heap> @@ -297,9 +327,9 @@ func.func @returns(%arg0: i1) { fir.freemem %0 : !fir.heap> return } -// CHECK: func.func @returns(%[[COND:.*]]: i1) { -// CHECK-NEXT: %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32> -// CHECK-NEXT: cf.cond_br %[[COND]], ^bb1, ^bb2 +// CHECK-LABEL: func.func @returns( +// CHECK: %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32> +// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // CHECK-NEXT: return // CHECK-NEXT: ^bb2: @@ -309,6 +339,11 @@ func.func @returns(%arg0: i1) { // Check multiple returns, where the memory is not freed on one branch func.func @returns2(%arg0: i1) { %0 = fir.allocmem !fir.array<42xi32> + %c0_ret2 = arith.constant 0 : index + %c0_i32_ret2 = arith.constant 0 : i32 + %ref_ret2 = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt_ret2 = fir.coordinate_of %ref_ret2, %c0_ret2 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_ret2 to %elt_ret2 : !fir.ref cf.cond_br %arg0, ^bb1, ^bb2 ^bb1: fir.freemem %0 : !fir.heap> @@ -316,9 +351,9 @@ func.func @returns2(%arg0: i1) { ^bb2: return } -// CHECK: func.func @returns2(%[[COND:.*]]: i1) { -// CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> -// CHECK-NEXT: cf.cond_br %[[COND]], ^bb1, ^bb2 +// CHECK-LABEL: func.func @returns2( +// CHECK: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> +// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap> // CHECK-NEXT: return @@ -338,7 +373,7 @@ func.func @omp_placement1() { } return } -// CHECK: func.func @omp_placement1() { +// CHECK-LABEL: func.func @omp_placement1() // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> // CHECK-NEXT: %[[MEM_CONV:.*]] = fir.convert %[[MEM]] : (!fir.ref>) -> !fir.heap> // CHECK-NEXT: omp.sections { @@ -353,19 +388,21 @@ func.func @omp_placement1() { // function terminated by stop statement func.func @stop_terminator() { %0 = fir.allocmem !fir.array<42xi32> + %c0 = arith.constant 0 : index + %c0_i32_st = arith.constant 0 : i32 + %ref4 = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt4 = fir.coordinate_of %ref4, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_st to %elt4 : !fir.ref fir.freemem %0 : !fir.heap> %c0_i32 = arith.constant 0 : i32 %false = arith.constant false fir.call @_FortranAStopStatement(%c0_i32, %false, %false) : (i32, i1, i1) -> () fir.unreachable } -// CHECK: func.func @stop_terminator() { -// CHECK-NEXT: fir.alloca !fir.array<42xi32> -// CHECK-NEXT: %[[ZERO:.*]] = arith.constant 0 : i32 -// CHECK-NEXT: %[[FALSE:.*]] = arith.constant false -// CHECK-NEXT: fir.call @_FortranAStopStatement(%[[ZERO]], %[[FALSE]], %[[FALSE]]) : (i32, i1, i1) -> () -// CHECK-NEXT: fir.unreachable -// CHECK-NEXT: } +// CHECK-LABEL: func.func @stop_terminator() +// CHECK: fir.alloca !fir.array<42xi32> +// CHECK: fir.call @_FortranAStopStatement( +// CHECK: fir.unreachable // check that stack allocations that use fir.declare which must be placed in loops @@ -387,7 +424,7 @@ func.func @placement_loop_declare() { } return } -// CHECK: func.func @placement_loop_declare() { +// CHECK-LABEL: func.func @placement_loop_declare() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -415,7 +452,7 @@ func.func @lookthrough() { fir.freemem %4 : !fir.heap> return } -// CHECK: func.func @lookthrough() { +// CHECK-LABEL: func.func @lookthrough() // CHECK: fir.alloca !fir.array<42xi32> // CHECK-NOT: fir.freemem @@ -457,6 +494,6 @@ func.func @finding_freemem_in_block() { ^bb3: // pred: ^bb1 return } -// CHECK: func.func @finding_freemem_in_block() { +// CHECK-LABEL: func.func @finding_freemem_in_block() // CHECK: fir.alloca !fir.array // CHECK-NOT: fir.freemem From ad9eb0d01649d82b17b68fa66fac2ae0a31744db Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Tue, 11 Nov 2025 01:36:25 +0300 Subject: [PATCH 25/30] Add default empty header filter regex to root .clang-tidy (#167386) After https://github.com/llvm/llvm-project/pull/164165, we emit warnings from non-system headers by default. This change only preserves functionality of `clang-tidy` as it was before the change. --- .clang-tidy | 1 + 1 file changed, 1 insertion(+) diff --git a/.clang-tidy b/.clang-tidy index 06bb0f18e9d2e..2cda1b81de808 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,3 +1,4 @@ +HeaderFilterRegex: '' Checks: > -*, clang-diagnostic-*, From 11ab23c33db5c990489023c91260435db288efd8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 10 Nov 2025 14:40:39 -0800 Subject: [PATCH 26/30] CodeGen: Keep reference to TargetRegisterInfo in TargetInstrInfo (#158224) Both conceptually belong to the same subtarget, so it should not be necessary to pass in the context TargetRegisterInfo to any TargetInstrInfo member. Add this reference so those superfluous arguments can be removed. Most targets placed their TargetRegisterInfo as a member in TargetInstrInfo. A few had this owned by the TargetSubtargetInfo, so unify all targets to look the same. --- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 11 ++- llvm/lib/CodeGen/TargetInstrInfo.cpp | 68 ++++++++----------- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +- llvm/lib/Target/ARC/ARCInstrInfo.cpp | 3 +- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 5 +- llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 9 ++- llvm/lib/Target/ARM/ARMInstrInfo.cpp | 3 +- llvm/lib/Target/ARM/ARMInstrInfo.h | 2 +- llvm/lib/Target/ARM/Thumb1InstrInfo.cpp | 2 +- llvm/lib/Target/ARM/Thumb1InstrInfo.h | 2 +- llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 2 +- llvm/lib/Target/ARM/Thumb2InstrInfo.h | 2 +- llvm/lib/Target/AVR/AVRInstrInfo.cpp | 4 +- llvm/lib/Target/BPF/BPFInstrInfo.cpp | 2 +- llvm/lib/Target/CSKY/CSKYInstrInfo.cpp | 2 +- llvm/lib/Target/DirectX/DirectXInstrInfo.cpp | 2 +- llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 4 +- llvm/lib/Target/Hexagon/HexagonInstrInfo.h | 5 ++ llvm/lib/Target/Hexagon/HexagonSubtarget.cpp | 3 +- llvm/lib/Target/Hexagon/HexagonSubtarget.h | 3 +- llvm/lib/Target/Lanai/LanaiInstrInfo.cpp | 3 +- .../Target/LoongArch/LoongArchInstrInfo.cpp | 4 +- .../lib/Target/LoongArch/LoongArchInstrInfo.h | 4 ++ .../Target/LoongArch/LoongArchSubtarget.cpp | 2 +- .../lib/Target/LoongArch/LoongArchSubtarget.h | 3 +- llvm/lib/Target/MSP430/MSP430InstrInfo.cpp | 3 +- llvm/lib/Target/Mips/Mips16InstrInfo.cpp | 6 +- llvm/lib/Target/Mips/Mips16InstrInfo.h | 2 +- llvm/lib/Target/Mips/MipsInstrInfo.cpp | 5 +- llvm/lib/Target/Mips/MipsInstrInfo.h | 8 ++- llvm/lib/Target/Mips/MipsSEInstrInfo.cpp | 6 +- llvm/lib/Target/Mips/MipsSEInstrInfo.h | 2 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp | 2 +- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 5 +- llvm/lib/Target/RISCV/RISCVInstrInfo.h | 3 + llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 2 +- llvm/lib/Target/RISCV/RISCVSubtarget.h | 3 +- llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp | 2 +- llvm/lib/Target/Sparc/SparcInstrInfo.cpp | 4 +- llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 2 +- llvm/lib/Target/VE/VEInstrInfo.cpp | 2 +- .../WebAssembly/WebAssemblyInstrInfo.cpp | 2 +- llvm/lib/Target/X86/X86InstrInfo.cpp | 2 +- llvm/lib/Target/XCore/XCoreInstrInfo.cpp | 2 +- llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp | 3 +- llvm/unittests/CodeGen/MFCommon.inc | 4 +- llvm/utils/TableGen/InstrInfoEmitter.cpp | 9 ++- 50 files changed, 126 insertions(+), 112 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 7010cffe23a11..02cd9256c422a 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -113,15 +113,18 @@ struct ExtAddrMode { /// class LLVM_ABI TargetInstrInfo : public MCInstrInfo { protected: + const TargetRegisterInfo &TRI; + /// Subtarget specific sub-array of MCInstrInfo's RegClassByHwModeTables /// (i.e. the table for the active HwMode). This should be indexed by /// MCOperandInfo's RegClass field for LookupRegClassByHwMode operands. const int16_t *const RegClassByHwMode; - TargetInstrInfo(unsigned CFSetupOpcode = ~0u, unsigned CFDestroyOpcode = ~0u, - unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u, + TargetInstrInfo(const TargetRegisterInfo &TRI, unsigned CFSetupOpcode = ~0u, + unsigned CFDestroyOpcode = ~0u, unsigned CatchRetOpcode = ~0u, + unsigned ReturnOpcode = ~0u, const int16_t *const RegClassByHwModeTable = nullptr) - : RegClassByHwMode(RegClassByHwModeTable), + : TRI(TRI), RegClassByHwMode(RegClassByHwModeTable), CallFrameSetupOpcode(CFSetupOpcode), CallFrameDestroyOpcode(CFDestroyOpcode), CatchRetOpcode(CatchRetOpcode), ReturnOpcode(ReturnOpcode) {} @@ -131,6 +134,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { TargetInstrInfo &operator=(const TargetInstrInfo &) = delete; virtual ~TargetInstrInfo(); + const TargetRegisterInfo &getRegisterInfo() const { return TRI; } + static bool isGenericOpcode(unsigned Opc) { return Opc <= TargetOpcode::GENERIC_OP_END; } diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 3c41bbeb4b327..7c89e51ff86d3 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -60,7 +60,7 @@ TargetInstrInfo::~TargetInstrInfo() = default; const TargetRegisterClass * TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo * /*RemoveMe*/) const { if (OpNum >= MCID.getNumOperands()) return nullptr; @@ -69,14 +69,14 @@ TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, // TODO: Remove isLookupPtrRegClass in favor of isLookupRegClassByHwMode if (OpInfo.isLookupPtrRegClass()) - return TRI->getPointerRegClass(RegClass); + return TRI.getPointerRegClass(RegClass); // Instructions like INSERT_SUBREG do not have fixed register classes. if (RegClass < 0) return nullptr; // Otherwise just look it up normally. - return TRI->getRegClass(RegClass); + return TRI.getRegClass(RegClass); } /// insertNoop - Insert a noop into the instruction stream at the specified @@ -223,13 +223,11 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI, // %1.sub = INST %1.sub(tied), %0.sub, implicit-def %1 SmallVector UpdateImplicitDefIdx; if (HasDef && MI.hasImplicitDef()) { - const TargetRegisterInfo *TRI = - MI.getMF()->getSubtarget().getRegisterInfo(); for (auto [OpNo, MO] : llvm::enumerate(MI.implicit_operands())) { Register ImplReg = MO.getReg(); if ((ImplReg.isVirtual() && ImplReg == Reg0) || (ImplReg.isPhysical() && Reg0.isPhysical() && - TRI->isSubRegisterEq(ImplReg, Reg0))) + TRI.isSubRegisterEq(ImplReg, Reg0))) UpdateImplicitDefIdx.push_back(OpNo + MI.getNumExplicitOperands()); } } @@ -425,37 +423,35 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx, unsigned &Size, unsigned &Offset, const MachineFunction &MF) const { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!SubIdx) { - Size = TRI->getSpillSize(*RC); + Size = TRI.getSpillSize(*RC); Offset = 0; return true; } - unsigned BitSize = TRI->getSubRegIdxSize(SubIdx); + unsigned BitSize = TRI.getSubRegIdxSize(SubIdx); // Convert bit size to byte size. if (BitSize % 8) return false; - int BitOffset = TRI->getSubRegIdxOffset(SubIdx); + int BitOffset = TRI.getSubRegIdxOffset(SubIdx); if (BitOffset < 0 || BitOffset % 8) return false; Size = BitSize / 8; Offset = (unsigned)BitOffset / 8; - assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); + assert(TRI.getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); if (!MF.getDataLayout().isLittleEndian()) { - Offset = TRI->getSpillSize(*RC) - (Offset + Size); + Offset = TRI.getSpillSize(*RC) - (Offset + Size); } return true; } -void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const { +void TargetInstrInfo::reMaterialize( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, + unsigned SubIdx, const MachineInstr &Orig, + const TargetRegisterInfo & /*Remove me*/) const { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig); MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI); MBB.insert(I, MI); @@ -726,7 +722,6 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, // actual load size is. int64_t MemSize = 0; const MachineFrameInfo &MFI = MF.getFrameInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (Flags & MachineMemOperand::MOStore) { MemSize = MFI.getObjectSize(FI); @@ -735,7 +730,7 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, int64_t OpSize = MFI.getObjectSize(FI); if (auto SubReg = MI.getOperand(OpIdx).getSubReg()) { - unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg); + unsigned SubRegSize = TRI.getSubRegIdxSize(SubReg); if (SubRegSize > 0 && !(SubRegSize % 8)) OpSize = SubRegSize / 8; } @@ -800,11 +795,11 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, // code. BuildMI(*MBB, Pos, MI.getDebugLoc(), get(TargetOpcode::KILL)).add(MO); } else { - storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI, + storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, &TRI, Register()); } } else - loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI, Register()); + loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, &TRI, Register()); return &*--Pos; } @@ -880,8 +875,8 @@ static void transferImplicitOperands(MachineInstr *MI, } } -void TargetInstrInfo::lowerCopy(MachineInstr *MI, - const TargetRegisterInfo *TRI) const { +void TargetInstrInfo::lowerCopy( + MachineInstr *MI, const TargetRegisterInfo * /*Remove me*/) const { if (MI->allDefsAreDead()) { MI->setDesc(get(TargetOpcode::KILL)); return; @@ -911,7 +906,7 @@ void TargetInstrInfo::lowerCopy(MachineInstr *MI, SrcMO.getReg().isPhysical() ? SrcMO.isRenamable() : false); if (MI->getNumOperands() > 2) - transferImplicitOperands(MI, TRI); + transferImplicitOperands(MI, &TRI); MI->eraseFromParent(); } @@ -1327,8 +1322,7 @@ void TargetInstrInfo::reassociateOps( MachineFunction *MF = Root.getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, &TRI); MachineOperand &OpA = Prev.getOperand(OperandIndices[1]); MachineOperand &OpB = Root.getOperand(OperandIndices[2]); @@ -1704,8 +1698,7 @@ bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // stack slot reference to depend on the instruction that does the // modification. const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI); + return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), &TRI); } // Provide a global flag for disabling the PreRA hazard recognizer that targets @@ -1738,11 +1731,11 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, // Default implementation of getMemOperandWithOffset. bool TargetInstrInfo::getMemOperandWithOffset( const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, - bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const { + bool &OffsetIsScalable, const TargetRegisterInfo * /*RemoveMe*/) const { SmallVector BaseOps; LocationSize Width = LocationSize::precise(0); if (!getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable, - Width, TRI) || + Width, &TRI) || BaseOps.size() != 1) return false; BaseOp = BaseOps.front(); @@ -1863,7 +1856,6 @@ std::optional TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const { const MachineFunction *MF = MI.getMF(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DIExpression *Expr = DIExpression::get(MF->getFunction().getContext(), {}); int64_t Offset; bool OffsetIsScalable; @@ -1894,7 +1886,6 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, // Only describe memory which provably does not escape the function. As // described in llvm.org/PR43343, escaped memory may be clobbered by the // callee (or by another thread). - const auto &TII = MF->getSubtarget().getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); const MachineMemOperand *MMO = MI.memoperands()[0]; const PseudoSourceValue *PSV = MMO->getPseudoValue(); @@ -1905,8 +1896,7 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, return std::nullopt; const MachineOperand *BaseOp; - if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, - TRI)) + if (!getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, &TRI)) return std::nullopt; // FIXME: Scalable offsets are not yet handled in the offset code below. @@ -2045,7 +2035,7 @@ bool TargetInstrInfo::getInsertSubregInputs( // Returns a MIRPrinter comment for this machine operand. std::string TargetInstrInfo::createMIROperandComment( const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo * /*RemoveMe*/) const { if (!MI.isInlineAsm()) return ""; @@ -2078,12 +2068,8 @@ std::string TargetInstrInfo::createMIROperandComment( OS << F.getKindName(); unsigned RCID; - if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID)) { - if (TRI) { - OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID)); - } else - OS << ":RC" << RCID; - } + if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID)) + OS << ':' << TRI.getRegClassName(TRI.getRegClass(RCID)); if (F.isMemKind()) { InlineAsm::ConstraintCode MCID = F.getMemoryConstraintID(); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5e95807a68199..66e49491b18e0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -91,7 +91,7 @@ static cl::opt GatherOptSearchLimit( "machine-combiner gather pattern optimization")); AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) - : AArch64GenInstrInfo(STI, AArch64::ADJCALLSTACKDOWN, + : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, AArch64::CATCHRET), RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {} diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 3e256cce97afb..01040854e1577 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -29,7 +29,7 @@ using namespace llvm; #include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {} + : R600GenInstrInfo(ST, RI, -1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9c74c654d8e35..6b397e0834686 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,7 +63,8 @@ static cl::opt Fix16BitCopies( cl::ReallyHidden); SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) - : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP, + AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { SchedModel.init(&ST); } diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp index 05bcb3596ac48..2dec6ffd89a75 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp +++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp @@ -44,7 +44,8 @@ enum TSFlagsConstants { void ARCInstrInfo::anchor() {} ARCInstrInfo::ARCInstrInfo(const ARCSubtarget &ST) - : ARCGenInstrInfo(ST, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {} + : ARCGenInstrInfo(ST, RI, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), + RI(ST) {} static bool isZeroImm(const MachineOperand &Op) { return Op.isImm() && Op.getImm() == 0; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 22769dbf38719..b466ca6f19b3e 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -107,8 +107,9 @@ static const ARM_MLxEntry ARM_MLxTable[] = { { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true }, }; -ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI) - : ARMGenInstrInfo(STI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), +ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI, + const ARMBaseRegisterInfo &TRI) + : ARMGenInstrInfo(STI, TRI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), Subtarget(STI) { for (unsigned i = 0, e = std::size(ARM_MLxTable); i != e; ++i) { if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 2869e7f708046..27f8e3b3170e7 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -44,7 +44,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { protected: // Can be only subclassed. - explicit ARMBaseInstrInfo(const ARMSubtarget &STI); + explicit ARMBaseInstrInfo(const ARMSubtarget &STI, + const ARMBaseRegisterInfo &TRI); void expandLoadStackGuardBase(MachineBasicBlock::iterator MI, unsigned LoadImmOpc, unsigned LoadOpc) const; @@ -125,7 +126,11 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { // if there is not such an opcode. virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0; - virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0; + const ARMBaseRegisterInfo &getRegisterInfo() const { + return static_cast( + TargetInstrInfo::getRegisterInfo()); + } + const ARMSubtarget &getSubtarget() const { return Subtarget; } ScheduleHazardRecognizer * diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/llvm/lib/Target/ARM/ARMInstrInfo.cpp index c684de7252e5d..f37054736b730 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -25,7 +25,8 @@ #include "llvm/MC/MCInst.h" using namespace llvm; -ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI) {} +ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI, RI) {} /// Return the noop instruction to use for a noop. MCInst ARMInstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.h b/llvm/lib/Target/ARM/ARMInstrInfo.h index 178d7a2c630e4..9feaf1440f2b2 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -35,7 +35,7 @@ class ARMInstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - const ARMRegisterInfo &getRegisterInfo() const override { return RI; } + const ARMRegisterInfo &getRegisterInfo() const { return RI; } private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 4b8c2fd569ead..f95ba6a4d86fb 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb1InstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/llvm/lib/Target/ARM/Thumb1InstrInfo.h index 68b326c0ebef6..16350a65a6198 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -35,7 +35,7 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const { return RI; } void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index f5653d459eac8..b66e4075d4cfa 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -46,7 +46,7 @@ PreferNoCSEL("prefer-no-csel", cl::Hidden, cl::init(false)); Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return the noop instruction to use for a noop. MCInst Thumb2InstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 1b0bf2d499510..59ef39d442aef 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -58,7 +58,7 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const { return RI; } MachineInstr *optimizeSelect(MachineInstr &MI, SmallPtrSetImpl &SeenMIs, diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index ce9908597dcac..5e247cb64a991 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -30,8 +30,8 @@ namespace llvm { AVRInstrInfo::AVRInstrInfo(const AVRSubtarget &STI) - : AVRGenInstrInfo(STI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(), - STI(STI) {} + : AVRGenInstrInfo(STI, RI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), + RI(), STI(STI) {} void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp index 409f8b4c253b8..0e56e658952a2 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp +++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp @@ -27,7 +27,7 @@ using namespace llvm; BPFInstrInfo::BPFInstrInfo(const BPFSubtarget &STI) - : BPFGenInstrInfo(STI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} + : BPFGenInstrInfo(STI, RI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index 619a797be6dc7..34a7de8d8ae96 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -25,7 +25,7 @@ using namespace llvm; #include "CSKYGenInstrInfo.inc" CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI) - : CSKYGenInstrInfo(STI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), + : CSKYGenInstrInfo(STI, RI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) { v2sf = STI.hasFPUv2SingleFloat(); v2df = STI.hasFPUv2DoubleFloat(); diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp index bb2efa43d818c..401881d6d0f67 100644 --- a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp @@ -19,6 +19,6 @@ using namespace llvm; DirectXInstrInfo::DirectXInstrInfo(const DirectXSubtarget &STI) - : DirectXGenInstrInfo(STI) {} + : DirectXGenInstrInfo(STI, RI) {} DirectXInstrInfo::~DirectXInstrInfo() {} diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 55bafdea234fd..dd9f2fa5cf342 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -118,9 +118,9 @@ const int Hexagon_ADDI_OFFSET_MIN = -32768; void HexagonInstrInfo::anchor() {} HexagonInstrInfo::HexagonInstrInfo(const HexagonSubtarget &ST) - : HexagonGenInstrInfo(ST, Hexagon::ADJCALLSTACKDOWN, + : HexagonGenInstrInfo(ST, RegInfo, Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), - Subtarget(ST) {} + RegInfo(ST.getHwMode()), Subtarget(ST) {} namespace llvm { namespace HexagonFUnits { diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index 48adf82833f51..7a0c77cc3f705 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -23,6 +23,8 @@ #include #include +#include "HexagonRegisterInfo.h" + #define GET_INSTRINFO_HEADER #include "HexagonGenInstrInfo.inc" @@ -36,6 +38,7 @@ class MachineOperand; class TargetRegisterInfo; class HexagonInstrInfo : public HexagonGenInstrInfo { + const HexagonRegisterInfo RegInfo; const HexagonSubtarget &Subtarget; enum BundleAttribute { @@ -47,6 +50,8 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { public: explicit HexagonInstrInfo(const HexagonSubtarget &ST); + const HexagonRegisterInfo &getRegisterInfo() const { return RegInfo; } + /// TargetInstrInfo overrides. /// If the specified machine instruction is a direct diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index a3c8a882c0616..66c8b0a67169d 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -76,8 +76,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, OptLevel(TM.getOptLevel()), CPUString(std::string(Hexagon_MC::selectHexagonCPU(CPU))), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), - RegInfo(getHwMode()), TLInfo(TM, *this), - InstrItins(getInstrItineraryForCPU(CPUString)) { + TLInfo(TM, *this), InstrItins(getInstrItineraryForCPU(CPUString)) { Hexagon_MC::addArchSubtarget(this, FS); // Beware of the default constructor of InstrItineraryData: it will // reset all members to 0. diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 995f66d0551b4..30794f61218a1 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -100,7 +100,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { // The following objects can use the TargetTriple, so they must be // declared after it. HexagonInstrInfo InstrInfo; - HexagonRegisterInfo RegInfo; HexagonTargetLowering TLInfo; HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; @@ -122,7 +121,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { } const HexagonInstrInfo *getInstrInfo() const override { return &InstrInfo; } const HexagonRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const HexagonTargetLowering *getTargetLowering() const override { return &TLInfo; diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp index 02ed1001cd0d3..b3d28565d4f06 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -27,7 +27,8 @@ using namespace llvm; #include "LanaiGenInstrInfo.inc" LanaiInstrInfo::LanaiInstrInfo(const LanaiSubtarget &STI) - : LanaiGenInstrInfo(STI, Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP), + : LanaiGenInstrInfo(STI, RegisterInfo, Lanai::ADJCALLSTACKDOWN, + Lanai::ADJCALLSTACKUP), RegisterInfo() {} void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 9a35df2f240c4..5eb3bd6d01a64 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -26,9 +26,9 @@ using namespace llvm; #include "LoongArchGenInstrInfo.inc" LoongArchInstrInfo::LoongArchInstrInfo(const LoongArchSubtarget &STI) - : LoongArchGenInstrInfo(STI, LoongArch::ADJCALLSTACKDOWN, + : LoongArchGenInstrInfo(STI, RegInfo, LoongArch::ADJCALLSTACKDOWN, LoongArch::ADJCALLSTACKUP), - STI(STI) {} + RegInfo(STI.getHwMode()), STI(STI) {} MCInst LoongArchInstrInfo::getNop() const { return MCInstBuilder(LoongArch::ANDI) diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index e61314c034bdb..d43d229256c13 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -24,9 +24,13 @@ namespace llvm { class LoongArchSubtarget; class LoongArchInstrInfo : public LoongArchGenInstrInfo { + const LoongArchRegisterInfo RegInfo; + public: explicit LoongArchInstrInfo(const LoongArchSubtarget &STI); + const LoongArchRegisterInfo &getRegisterInfo() const { return RegInfo; } + MCInst getNop() const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp index 3acbe4992273a..76a8ba1c90e50 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp @@ -95,4 +95,4 @@ LoongArchSubtarget::LoongArchSubtarget(const Triple &TT, StringRef CPU, : LoongArchGenSubtargetInfo(TT, CPU, TuneCPU, FS), FrameLowering( initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)), - InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) {} + InstrInfo(*this), TLInfo(TM, *this) {} diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h index 5e12bafebb0d5..2beff07949daf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h @@ -45,7 +45,6 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown; LoongArchFrameLowering FrameLowering; LoongArchInstrInfo InstrInfo; - LoongArchRegisterInfo RegInfo; LoongArchTargetLowering TLInfo; SelectionDAGTargetInfo TSInfo; @@ -78,7 +77,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { } const LoongArchInstrInfo *getInstrInfo() const override { return &InstrInfo; } const LoongArchRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const LoongArchTargetLowering *getTargetLowering() const override { return &TLInfo; diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 65b4820752c94..af053b8645b9b 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -26,7 +26,8 @@ using namespace llvm; void MSP430InstrInfo::anchor() {} MSP430InstrInfo::MSP430InstrInfo(const MSP430Subtarget &STI) - : MSP430GenInstrInfo(STI, MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP), + : MSP430GenInstrInfo(STI, RI, MSP430::ADJCALLSTACKDOWN, + MSP430::ADJCALLSTACKUP), RI() {} void MSP430InstrInfo::storeRegToStackSlot( diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index aa94f54cdf9a0..69b96cf3fc933 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -37,11 +37,7 @@ using namespace llvm; #define DEBUG_TYPE "mips16-instrinfo" Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, Mips::Bimm16), RI(STI) {} - -const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { - return RI; -} + : MipsInstrInfo(STI, RI, Mips::Bimm16), RI(STI) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.h b/llvm/lib/Target/Mips/Mips16InstrInfo.h index 1058e8c25fb5b..2834fd3d7eec2 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.h +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.h @@ -30,7 +30,7 @@ class Mips16InstrInfo : public MipsInstrInfo { public: explicit Mips16InstrInfo(const MipsSubtarget &STI); - const MipsRegisterInfo &getRegisterInfo() const override; + const Mips16RegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp index bffdffa4af6a0..c879c46e49dd4 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -39,8 +39,9 @@ using namespace llvm; // Pin the vtable to this file. void MipsInstrInfo::anchor() {} -MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr) - : MipsGenInstrInfo(STI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), +MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, + const MipsRegisterInfo &RI, unsigned UncondBr) + : MipsGenInstrInfo(STI, RI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), Subtarget(STI), UncondBrOpc(UncondBr) {} const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) { diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index 2337ae7c079e7..fc9424808756a 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -55,7 +55,8 @@ class MipsInstrInfo : public MipsGenInstrInfo { BT_Indirect // One indirct branch. }; - explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc); + explicit MipsInstrInfo(const MipsSubtarget &STI, const MipsRegisterInfo &RI, + unsigned UncondBrOpc); MCInst getNop() const override; @@ -130,7 +131,10 @@ class MipsInstrInfo : public MipsGenInstrInfo { /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). - virtual const MipsRegisterInfo &getRegisterInfo() const = 0; + const MipsRegisterInfo &getRegisterInfo() const { + return static_cast( + TargetInstrInfo::getRegisterInfo()); + } virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0; diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index dbdbb179a583d..517f489984c27 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -28,11 +28,7 @@ static unsigned getUnconditionalBranch(const MipsSubtarget &STI) { } MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, getUnconditionalBranch(STI)), RI(STI) {} - -const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const { - return RI; -} + : MipsInstrInfo(STI, RI, getUnconditionalBranch(STI)), RI(STI) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/llvm/lib/Target/Mips/MipsSEInstrInfo.h index 2b4f55d184b8b..0a7a0e5ac2a56 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.h @@ -24,7 +24,7 @@ class MipsSEInstrInfo : public MipsInstrInfo { public: explicit MipsSEInstrInfo(const MipsSubtarget &STI); - const MipsRegisterInfo &getRegisterInfo() const override; + const MipsSERegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 6840c7ae8faf4..db2d96f5ff532 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -26,7 +26,7 @@ using namespace llvm; void NVPTXInstrInfo::anchor() {} NVPTXInstrInfo::NVPTXInstrInfo(const NVPTXSubtarget &STI) - : NVPTXGenInstrInfo(STI), RegInfo() {} + : NVPTXGenInstrInfo(STI, RegInfo), RegInfo() {} void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 3014aa6bfe31e..8d9d4c7f8e128 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -89,7 +89,7 @@ static cl::opt EnableFMARegPressureReduction( void PPCInstrInfo::anchor() {} PPCInstrInfo::PPCInstrInfo(const PPCSubtarget &STI) - : PPCGenInstrInfo(STI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, + : PPCGenInstrInfo(STI, RI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, /* CatchRetOpcode */ -1, STI.isPPC64() ? PPC::BLR8 : PPC::BLR), Subtarget(STI), RI(STI.getTargetMachine()) {} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index b05956b674d18..ce8dd3ba4b7b3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -82,8 +82,9 @@ namespace llvm::RISCV { } // end namespace llvm::RISCV RISCVInstrInfo::RISCVInstrInfo(const RISCVSubtarget &STI) - : RISCVGenInstrInfo(STI, RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP), - STI(STI) {} + : RISCVGenInstrInfo(STI, RegInfo, RISCV::ADJCALLSTACKDOWN, + RISCV::ADJCALLSTACKUP), + RegInfo(STI.getHwMode()), STI(STI) {} #define GET_INSTRINFO_HELPERS #include "RISCVGenInstrInfo.inc" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index c5eddb9e90fbf..800af2621b7df 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -79,10 +79,13 @@ enum RISCVMachineCombinerPattern : unsigned { }; class RISCVInstrInfo : public RISCVGenInstrInfo { + const RISCVRegisterInfo RegInfo; public: explicit RISCVInstrInfo(const RISCVSubtarget &STI); + const RISCVRegisterInfo &getRegisterInfo() const { return RegInfo; } + MCInst getNop() const override; Register isLoadFromStackSlot(const MachineInstr &MI, diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 715ac4cedc649..3b43be3d15743 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -104,7 +104,7 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, RVVVectorBitsMin(RVVVectorBitsMin), RVVVectorBitsMax(RVVVectorBitsMax), FrameLowering( initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)), - InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) { + InstrInfo(*this), TLInfo(TM, *this) { TSInfo = std::make_unique(); } diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4b4fc8f0d8e76..d5ffa6c812cec 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -112,7 +112,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { RISCVFrameLowering FrameLowering; RISCVInstrInfo InstrInfo; - RISCVRegisterInfo RegInfo; RISCVTargetLowering TLInfo; /// Initializes using the passed in CPU and feature strings so that we can @@ -140,7 +139,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { } const RISCVInstrInfo *getInstrInfo() const override { return &InstrInfo; } const RISCVRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const RISCVTargetLowering *getTargetLowering() const override { return &TLInfo; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index ba95ad822df75..4f8bf4312a380 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; SPIRVInstrInfo::SPIRVInstrInfo(const SPIRVSubtarget &STI) - : SPIRVGenInstrInfo(STI) {} + : SPIRVGenInstrInfo(STI, RI) {} bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const { switch (MI.getOpcode()) { diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index f66eb9dbee2dc..fcd6cd7f262bc 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -38,8 +38,8 @@ static cl::opt void SparcInstrInfo::anchor() {} SparcInstrInfo::SparcInstrInfo(const SparcSubtarget &ST) - : SparcGenInstrInfo(ST, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(ST), - Subtarget(ST) {} + : SparcGenInstrInfo(ST, RI, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), + RI(ST), Subtarget(ST) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 2e21f27c9032f..23e7e7e314033 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -60,7 +60,7 @@ static uint64_t allOnes(unsigned int Count) { void SystemZInstrInfo::anchor() {} SystemZInstrInfo::SystemZInstrInfo(const SystemZSubtarget &sti) - : SystemZGenInstrInfo(sti, -1, -1), + : SystemZGenInstrInfo(sti, RI, -1, -1), RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister(), sti.getHwMode()), STI(sti) {} diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index d5e804afd27fe..bae703bd97ac8 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -35,7 +35,7 @@ using namespace llvm; void VEInstrInfo::anchor() {} VEInstrInfo::VEInstrInfo(const VESubtarget &ST) - : VEGenInstrInfo(ST, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} + : VEGenInstrInfo(ST, RI, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 343d90e88950f..8b4e4fbbbd1e5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -34,7 +34,7 @@ using namespace llvm; #include "WebAssemblyGenInstrInfo.inc" WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) - : WebAssemblyGenInstrInfo(STI, WebAssembly::ADJCALLSTACKDOWN, + : WebAssemblyGenInstrInfo(STI, RI, WebAssembly::ADJCALLSTACKDOWN, WebAssembly::ADJCALLSTACKUP, WebAssembly::CATCHRET), RI(STI.getTargetTriple()) {} diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 6b2a7a4ec3583..2c6d1af225b6b 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -85,7 +85,7 @@ static cl::opt UndefRegClearance( void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) - : X86GenInstrInfo(STI, + : X86GenInstrInfo(STI, RI, (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp index 1a9133aad4dd3..80fda3480aa44 100644 --- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp @@ -43,7 +43,7 @@ namespace XCore { void XCoreInstrInfo::anchor() {} XCoreInstrInfo::XCoreInstrInfo(const XCoreSubtarget &ST) - : XCoreGenInstrInfo(ST, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP), + : XCoreGenInstrInfo(ST, RI, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP), RI() {} static bool isZeroImm(const MachineOperand &op) { diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp index be69cefb5b78f..6bbebdecd9988 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp @@ -48,7 +48,8 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) { } XtensaInstrInfo::XtensaInstrInfo(const XtensaSubtarget &STI) - : XtensaGenInstrInfo(STI, Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP), + : XtensaGenInstrInfo(STI, RI, Xtensa::ADJCALLSTACKDOWN, + Xtensa::ADJCALLSTACKUP), RI(STI), STI(STI) {} Register XtensaInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index 0180ba0a6c163..783267aa05b59 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -76,8 +76,10 @@ public: }; class BogusTargetInstrInfo : public TargetInstrInfo { + BogusRegisterInfo RegInfo; + public: - BogusTargetInstrInfo() : TargetInstrInfo() {} + BogusTargetInstrInfo() : TargetInstrInfo(RegInfo) {} }; class BogusSubtarget : public TargetSubtargetInfo { diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index ee3cd8c20f8e7..32994c12aa98b 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -1103,7 +1103,8 @@ void InstrInfoEmitter::run(raw_ostream &OS) { Twine ClassName = TargetName + "GenInstrInfo"; OS << "struct " << ClassName << " : public TargetInstrInfo {\n" << " explicit " << ClassName - << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode = ~0u, " + << "(const TargetSubtargetInfo &STI, const TargetRegisterInfo &TRI, " + "unsigned CFSetupOpcode = ~0u, " "unsigned CFDestroyOpcode = ~0u, " "unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u);\n" << " ~" << ClassName << "() override = default;\n" @@ -1157,9 +1158,11 @@ void InstrInfoEmitter::run(raw_ostream &OS) { << TargetName << "InstrComplexDeprecationInfos[];\n"; Twine ClassName = TargetName + "GenInstrInfo"; OS << ClassName << "::" << ClassName - << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode, unsigned " + << "(const TargetSubtargetInfo &STI, const TargetRegisterInfo &TRI, " + "unsigned CFSetupOpcode, unsigned " "CFDestroyOpcode, unsigned CatchRetOpcode, unsigned ReturnOpcode)\n" - << " : TargetInstrInfo(CFSetupOpcode, CFDestroyOpcode, CatchRetOpcode, " + << " : TargetInstrInfo(TRI, CFSetupOpcode, CFDestroyOpcode, " + "CatchRetOpcode, " "ReturnOpcode"; if (NumClassesByHwMode != 0) OS << ", " << TargetName From da996a3805f43333d1d391b223ec756d53f5e830 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 10 Nov 2025 14:43:46 -0800 Subject: [PATCH 27/30] [NFC][SpecialCaseList] Refactor error handling (#167277) Will help in future patches. --- llvm/lib/Support/SpecialCaseList.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 395a55d75acd4..beec8b8ef0d5b 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -274,11 +274,12 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, bool RemoveDotSlash = Version > 2; - Section *CurrentSection; - if (auto Err = addSection("*", FileIdx, 1, true).moveInto(CurrentSection)) { + auto ErrOrSection = addSection("*", FileIdx, 1, true); + if (auto Err = ErrOrSection.takeError()) { Error = toString(std::move(Err)); return false; } + Section *CurrentSection = ErrOrSection.get(); // This is the current list of prefixes for all existing users matching file // path. We may need parametrization in constructor in future. @@ -300,12 +301,13 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, return false; } - if (auto Err = addSection(Line.drop_front().drop_back(), FileIdx, LineNo, - UseGlobs) - .moveInto(CurrentSection)) { + auto ErrOrSection = + addSection(Line.drop_front().drop_back(), FileIdx, LineNo, UseGlobs); + if (auto Err = ErrOrSection.takeError()) { Error = toString(std::move(Err)); return false; } + CurrentSection = ErrOrSection.get(); continue; } From 5b8e86909c94ad81b4c2f0cee50c3433ec318788 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Mon, 10 Nov 2025 14:48:08 -0800 Subject: [PATCH 28/30] [llc] Fix save-stats test in read only directories (#167403) The save stats test would fail downstream due to the tests being run from a read only directory. The cwd flag would attempt to place the statistics in that read only directory, causing an error. This patch ensures that the tests are always run from a temp directory. --- llvm/test/tools/llc/save-stats.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/tools/llc/save-stats.ll b/llvm/test/tools/llc/save-stats.ll index 4950625c809cc..a5769f86648dc 100644 --- a/llvm/test/tools/llc/save-stats.ll +++ b/llvm/test/tools/llc/save-stats.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts +; RUN: rm -rf %t.dir && mkdir -p %t.dir && cd %t.dir ; RUN: llc --save-stats=obj -o %t.s %s && cat %t.stats | FileCheck %s ; RUN: llc --save-stats=cwd -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s ; RUN: llc --save-stats -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s From c5ce8021da09cfe232abe2e869269297b00fbcdb Mon Sep 17 00:00:00 2001 From: Sterling-Augustine Date: Mon, 10 Nov 2025 14:58:26 -0800 Subject: [PATCH 29/30] [libc] fwrite_unlocked: only return errno if an actual error occurred. (#167350) fwrite and friends don't modify errno if no error occurred. Therefore frite_unlocked's return value shouldn't be constructed from errno without checking if an error actually occurred. This fixes an error introduced by https://github.com/llvm/llvm-project/commit/9e2f73fe9052a4fbf382a06e30b2441c6d99fb7e --- libc/src/stdio/printf_core/vfprintf_internal.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h index 564441d3bf51a..c47a03d741f98 100644 --- a/libc/src/stdio/printf_core/vfprintf_internal.h +++ b/libc/src/stdio/printf_core/vfprintf_internal.h @@ -51,8 +51,11 @@ LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); } LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, ::FILE *f) { // Need to use system errno in this case, as system write will set this errno - // which we need to propagate back into our code. - return {::fwrite_unlocked(ptr, size, nmemb, f), errno}; + // which we need to propagate back into our code. fwrite only modifies errno + // if there was an error, and errno may have previously been nonzero. Only + // return errno if there was an error. + size_t members_written = ::fwrite_unlocked(ptr, size, nmemb, f); + return {members_written, members_written == nmemb ? 0 : errno}; } #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace internal From 3a7359545ae533a59783fe6f9815b421ca1443dd Mon Sep 17 00:00:00 2001 From: Ron Lieberman Date: Tue, 11 Nov 2025 04:54:50 -0600 Subject: [PATCH 30/30] Revert "Reland "[clang] Refactor option-related code from clangDriver into new clangOptions library" (#167374)" This reverts commit f63d33da0a517a9a7096e0e67defb50c5995dd41. --- clang-tools-extra/clangd/CMakeLists.txt | 1 - clang-tools-extra/clangd/CompileCommands.cpp | 27 +- clang-tools-extra/modularize/CMakeLists.txt | 1 - .../modularize/CoverageChecker.cpp | 6 +- clang-tools-extra/modularize/Modularize.cpp | 4 +- .../modularize/ModularizeUtilities.cpp | 6 +- clang-tools-extra/pp-trace/CMakeLists.txt | 1 - clang-tools-extra/pp-trace/PPTrace.cpp | 2 +- clang/docs/CMakeLists.txt | 2 +- clang/docs/InternalsManual.rst | 8 +- clang/docs/ReleaseNotes.rst | 3 - clang/include/clang/CMakeLists.txt | 2 +- .../clang/{Options => Driver}/CMakeLists.txt | 0 .../{Options => Driver}/ClangOptionDocs.td | 0 clang/include/clang/Driver/Driver.h | 2 +- .../clang/{Options => Driver}/OptionUtils.h | 6 +- .../clang/{Options => Driver}/Options.h | 20 +- .../clang/{Options => Driver}/Options.td | 0 clang/include/clang/Frontend/Utils.h | 2 +- clang/include/module.modulemap | 1 - clang/lib/CMakeLists.txt | 1 - clang/lib/Driver/CMakeLists.txt | 3 +- clang/lib/Driver/Compilation.cpp | 2 +- clang/lib/Driver/Driver.cpp | 3 +- .../lib/{Options => Driver}/DriverOptions.cpp | 19 +- clang/lib/{Options => Driver}/OptionUtils.cpp | 2 +- clang/lib/Driver/SanitizerArgs.cpp | 2 +- clang/lib/Driver/ToolChain.cpp | 6 +- clang/lib/Driver/ToolChains/AIX.cpp | 6 +- clang/lib/Driver/ToolChains/AMDGPU.cpp | 21 +- clang/lib/Driver/ToolChains/AMDGPU.h | 2 +- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 3 +- clang/lib/Driver/ToolChains/AVR.cpp | 2 +- clang/lib/Driver/ToolChains/Arch/AArch64.cpp | 4 +- clang/lib/Driver/ToolChains/Arch/ARM.cpp | 4 +- clang/lib/Driver/ToolChains/Arch/CSKY.cpp | 6 +- .../lib/Driver/ToolChains/Arch/LoongArch.cpp | 5 +- clang/lib/Driver/ToolChains/Arch/M68k.cpp | 16 +- clang/lib/Driver/ToolChains/Arch/Mips.cpp | 5 +- clang/lib/Driver/ToolChains/Arch/PPC.cpp | 2 +- clang/lib/Driver/ToolChains/Arch/RISCV.cpp | 2 +- clang/lib/Driver/ToolChains/Arch/Sparc.cpp | 4 +- clang/lib/Driver/ToolChains/Arch/SystemZ.cpp | 10 +- clang/lib/Driver/ToolChains/Arch/VE.cpp | 2 +- clang/lib/Driver/ToolChains/Arch/X86.cpp | 18 +- clang/lib/Driver/ToolChains/BareMetal.cpp | 4 +- clang/lib/Driver/ToolChains/CSKYToolChain.cpp | 2 +- clang/lib/Driver/ToolChains/Clang.cpp | 13 +- clang/lib/Driver/ToolChains/CommonArgs.cpp | 24 +- clang/lib/Driver/ToolChains/CrossWindows.cpp | 2 +- clang/lib/Driver/ToolChains/Cuda.cpp | 8 +- clang/lib/Driver/ToolChains/Cygwin.cpp | 4 +- clang/lib/Driver/ToolChains/Darwin.cpp | 6 +- clang/lib/Driver/ToolChains/DragonFly.cpp | 4 +- clang/lib/Driver/ToolChains/Flang.cpp | 7 +- clang/lib/Driver/ToolChains/FreeBSD.cpp | 4 +- clang/lib/Driver/ToolChains/Fuchsia.cpp | 4 +- clang/lib/Driver/ToolChains/Gnu.cpp | 10 +- clang/lib/Driver/ToolChains/HIPAMD.cpp | 2 +- clang/lib/Driver/ToolChains/HIPSPV.cpp | 4 +- clang/lib/Driver/ToolChains/HIPUtility.cpp | 2 +- clang/lib/Driver/ToolChains/Hexagon.cpp | 2 +- clang/lib/Driver/ToolChains/Hurd.cpp | 4 +- clang/lib/Driver/ToolChains/Linux.cpp | 4 +- clang/lib/Driver/ToolChains/MSP430.cpp | 2 +- clang/lib/Driver/ToolChains/MSVC.cpp | 2 +- clang/lib/Driver/ToolChains/Managarm.cpp | 4 +- clang/lib/Driver/ToolChains/MinGW.cpp | 2 +- clang/lib/Driver/ToolChains/MipsLinux.cpp | 4 +- clang/lib/Driver/ToolChains/NetBSD.cpp | 4 +- clang/lib/Driver/ToolChains/OHOS.cpp | 4 +- clang/lib/Driver/ToolChains/OpenBSD.cpp | 4 +- clang/lib/Driver/ToolChains/PPCFreeBSD.cpp | 4 +- clang/lib/Driver/ToolChains/PPCLinux.cpp | 4 +- clang/lib/Driver/ToolChains/PS4CPU.cpp | 2 +- clang/lib/Driver/ToolChains/SPIRV.cpp | 2 +- clang/lib/Driver/ToolChains/SYCL.cpp | 2 +- clang/lib/Driver/ToolChains/Solaris.cpp | 4 +- clang/lib/Driver/ToolChains/UEFI.cpp | 2 +- clang/lib/Driver/ToolChains/VEToolchain.cpp | 6 +- clang/lib/Driver/ToolChains/WebAssembly.cpp | 6 +- clang/lib/Driver/ToolChains/XCore.cpp | 6 +- clang/lib/Driver/ToolChains/ZOS.cpp | 2 +- clang/lib/Driver/XRayArgs.cpp | 2 +- clang/lib/Frontend/CMakeLists.txt | 1 - clang/lib/Frontend/CompilerInvocation.cpp | 54 +- .../CreateInvocationFromCommandLine.cpp | 6 +- clang/lib/FrontendTool/CMakeLists.txt | 1 - .../ExecuteCompilerInvocation.cpp | 6 +- clang/lib/Interpreter/Interpreter.cpp | 4 +- clang/lib/Interpreter/InterpreterUtils.h | 2 +- clang/lib/Options/CMakeLists.txt | 18 - clang/lib/Tooling/CMakeLists.txt | 1 - .../InterpolatingCompilationDatabase.cpp | 12 +- clang/lib/Tooling/Tooling.cpp | 12 +- clang/tools/clang-check/CMakeLists.txt | 1 - clang/tools/clang-check/ClangCheck.cpp | 4 +- clang/tools/clang-installapi/CMakeLists.txt | 1 - .../clang-installapi/ClangInstallAPI.cpp | 4 +- clang/tools/clang-installapi/Options.cpp | 55 +- clang/tools/driver/CMakeLists.txt | 1 - clang/tools/driver/cc1_main.cpp | 2 +- clang/tools/driver/cc1as_main.cpp | 8 +- clang/tools/driver/driver.cpp | 2 +- clang/unittests/Driver/DXCModeTest.cpp | 4 +- clang/www/OpenProjects.html | 2 +- flang/docs/CMakeLists.txt | 2 +- flang/docs/FlangDriver.md | 8 +- flang/lib/Frontend/CMakeLists.txt | 1 - flang/lib/Frontend/CompilerInvocation.cpp | 553 ++++++++++-------- flang/lib/FrontendTool/CMakeLists.txt | 1 - .../ExecuteCompilerInvocation.cpp | 6 +- flang/tools/flang-driver/CMakeLists.txt | 1 - flang/tools/flang-driver/driver.cpp | 4 +- .../Platform/MacOSX/PlatformDarwin.cpp | 4 +- llvm/include/llvm/MC/MCAsmInfo.h | 2 +- llvm/include/llvm/Option/Arg.h | 2 +- revert_patches.txt | 3 + .../llvm-project-overlay/clang/BUILD.bazel | 4 +- 119 files changed, 623 insertions(+), 591 deletions(-) rename clang/include/clang/{Options => Driver}/CMakeLists.txt (100%) rename clang/include/clang/{Options => Driver}/ClangOptionDocs.td (100%) rename clang/include/clang/{Options => Driver}/OptionUtils.h (94%) rename clang/include/clang/{Options => Driver}/Options.h (83%) rename clang/include/clang/{Options => Driver}/Options.td (100%) rename clang/lib/{Options => Driver}/DriverOptions.cpp (76%) rename clang/lib/{Options => Driver}/OptionUtils.cpp (97%) delete mode 100644 clang/lib/Options/CMakeLists.txt diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index d7ec853af862f..fb3f05329be21 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -165,7 +165,6 @@ clang_target_link_libraries(clangDaemon clangBasic clangDependencyScanning clangDriver - clangOptions clangFormat clangFrontend clangIndex diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp index 7990f2719e9a0..c1be93730129a 100644 --- a/clang-tools-extra/clangd/CompileCommands.cpp +++ b/clang-tools-extra/clangd/CompileCommands.cpp @@ -11,8 +11,8 @@ #include "support/Logger.h" #include "support/Trace.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInvocation.h" -#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/ADT/ArrayRef.h" @@ -206,7 +206,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, if (Cmd.empty()) return; - auto &OptTable = getDriverOptTable(); + auto &OptTable = clang::driver::getDriverOptTable(); // OriginalArgs needs to outlive ArgList. llvm::SmallVector OriginalArgs; OriginalArgs.reserve(Cmd.size()); @@ -222,8 +222,8 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, llvm::opt::InputArgList ArgList; ArgList = OptTable.ParseArgs( llvm::ArrayRef(OriginalArgs).drop_front(), IgnoredCount, IgnoredCount, - llvm::opt::Visibility(IsCLMode ? options::CLOption - : options::ClangOption)); + llvm::opt::Visibility(IsCLMode ? driver::options::CLOption + : driver::options::ClangOption)); llvm::SmallVector IndicesToDrop; // Having multiple architecture options (e.g. when building fat binaries) @@ -232,7 +232,7 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, // As there are no signals to figure out which one user actually wants. They // can explicitly specify one through `CompileFlags.Add` if need be. unsigned ArchOptCount = 0; - for (auto *Input : ArgList.filtered(options::OPT_arch)) { + for (auto *Input : ArgList.filtered(driver::options::OPT_arch)) { ++ArchOptCount; for (auto I = 0U; I <= Input->getNumValues(); ++I) IndicesToDrop.push_back(Input->getIndex() + I); @@ -262,12 +262,13 @@ void CommandMangler::operator()(tooling::CompileCommand &Command, // explicitly at the end of the flags. This ensures modifications done in the // following steps apply in more cases (like setting -x, which only affects // inputs that come after it). - for (auto *Input : ArgList.filtered(options::OPT_INPUT)) { + for (auto *Input : ArgList.filtered(driver::options::OPT_INPUT)) { SawInput(Input->getValue(0)); IndicesToDrop.push_back(Input->getIndex()); } // Anything after `--` is also treated as input, drop them as well. - if (auto *DashDash = ArgList.getLastArgNoClaim(options::OPT__DASH_DASH)) { + if (auto *DashDash = + ArgList.getLastArgNoClaim(driver::options::OPT__DASH_DASH)) { auto DashDashIndex = DashDash->getIndex() + 1; // +1 accounts for Cmd[0] // Another +1 so we don't treat the `--` itself as an input. for (unsigned I = DashDashIndex + 1; I < Cmd.size(); ++I) @@ -423,11 +424,11 @@ DriverMode getDriverMode(const std::vector &Args) { // Returns the set of DriverModes where an option may be used. unsigned char getModes(const llvm::opt::Option &Opt) { unsigned char Result = DM_None; - if (Opt.hasVisibilityFlag(options::ClangOption)) + if (Opt.hasVisibilityFlag(driver::options::ClangOption)) Result |= DM_GCC; - if (Opt.hasVisibilityFlag(options::CC1Option)) + if (Opt.hasVisibilityFlag(driver::options::CC1Option)) Result |= DM_CC1; - if (Opt.hasVisibilityFlag(options::CLOption)) + if (Opt.hasVisibilityFlag(driver::options::CLOption)) Result |= DM_CL; return Result; } @@ -441,8 +442,8 @@ llvm::ArrayRef ArgStripper::rulesFor(llvm::StringRef Arg) { using TableTy = llvm::StringMap, llvm::BumpPtrAllocator>; static TableTy *Table = [] { - auto &DriverTable = getDriverOptTable(); - using DriverID = clang::options::ID; + auto &DriverTable = driver::getDriverOptTable(); + using DriverID = clang::driver::options::ID; // Collect sets of aliases, so we can treat -foo and -foo= as synonyms. // Conceptually a double-linked list: PrevAlias[I] -> I -> NextAlias[I]. @@ -467,7 +468,7 @@ llvm::ArrayRef ArgStripper::rulesFor(llvm::StringRef Arg) { FLAGS, VISIBILITY, PARAM, HELPTEXT, HELPTEXTSFORVARIANTS, \ METAVAR, VALUES, SUBCOMMANDIDS_OFFSET) \ {DriverID::OPT_##ID, DriverID::OPT_##ALIAS, ALIASARGS}, -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef OPTION }; for (auto &E : AliasTable) diff --git a/clang-tools-extra/modularize/CMakeLists.txt b/clang-tools-extra/modularize/CMakeLists.txt index a775b790a3147..eb5383c3ad44e 100644 --- a/clang-tools-extra/modularize/CMakeLists.txt +++ b/clang-tools-extra/modularize/CMakeLists.txt @@ -20,7 +20,6 @@ clang_target_link_libraries(modularize clangAST clangBasic clangDriver - clangOptions clangFrontend clangLex clangSerialization diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp index d80d78c64c6e2..1345a6ef8f489 100644 --- a/clang-tools-extra/modularize/CoverageChecker.cpp +++ b/clang-tools-extra/modularize/CoverageChecker.cpp @@ -50,18 +50,18 @@ // //===----------------------------------------------------------------------===// -#include "CoverageChecker.h" #include "ModularizeUtilities.h" #include "clang/AST/ASTConsumer.h" +#include "CoverageChecker.h" #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" +#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/Preprocessor.h" -#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Option.h" @@ -73,7 +73,7 @@ using namespace Modularize; using namespace clang; using namespace clang::driver; -using namespace clang::options; +using namespace clang::driver::options; using namespace clang::tooling; namespace cl = llvm::cl; namespace sys = llvm::sys; diff --git a/clang-tools-extra/modularize/Modularize.cpp b/clang-tools-extra/modularize/Modularize.cpp index 33966b44f719a..376ad0c7875bf 100644 --- a/clang-tools-extra/modularize/Modularize.cpp +++ b/clang-tools-extra/modularize/Modularize.cpp @@ -231,11 +231,11 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" +#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/Preprocessor.h" -#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Arg.h" @@ -254,7 +254,7 @@ using namespace clang; using namespace clang::driver; -using namespace clang::options; +using namespace clang::driver::options; using namespace clang::tooling; using namespace llvm; using namespace llvm::opt; diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp index 6978a6b2fe1b7..4dd84feac5df4 100644 --- a/clang-tools-extra/modularize/ModularizeUtilities.cpp +++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp @@ -12,17 +12,17 @@ // //===----------------------------------------------------------------------===// -#include "ModularizeUtilities.h" -#include "CoverageChecker.h" #include "clang/Basic/SourceManager.h" +#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendActions.h" -#include "clang/Options/Options.h" +#include "CoverageChecker.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" +#include "ModularizeUtilities.h" using namespace clang; using namespace llvm; diff --git a/clang-tools-extra/pp-trace/CMakeLists.txt b/clang-tools-extra/pp-trace/CMakeLists.txt index da36582ee0234..1323adbc35269 100644 --- a/clang-tools-extra/pp-trace/CMakeLists.txt +++ b/clang-tools-extra/pp-trace/CMakeLists.txt @@ -14,7 +14,6 @@ clang_target_link_libraries(pp-trace PRIVATE clangAST clangBasic - clangOptions clangFrontend clangLex clangSerialization diff --git a/clang-tools-extra/pp-trace/PPTrace.cpp b/clang-tools-extra/pp-trace/PPTrace.cpp index ba5a06a26830d..0b078c49a55b7 100644 --- a/clang-tools-extra/pp-trace/PPTrace.cpp +++ b/clang-tools-extra/pp-trace/PPTrace.cpp @@ -28,11 +28,11 @@ #include "clang/AST/ASTConsumer.h" #include "clang/AST/ASTContext.h" #include "clang/Basic/SourceManager.h" +#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/Preprocessor.h" -#include "clang/Options/Options.h" #include "clang/Tooling/Execution.h" #include "clang/Tooling/Tooling.h" #include "llvm/Option/Arg.h" diff --git a/clang/docs/CMakeLists.txt b/clang/docs/CMakeLists.txt index 9469a832adb62..1f06c040c96cb 100644 --- a/clang/docs/CMakeLists.txt +++ b/clang/docs/CMakeLists.txt @@ -132,7 +132,7 @@ if (LLVM_ENABLE_SPHINX) # Generated files gen_rst_file_from_td(AttributeReference.rst -gen-attr-docs ../include/clang/Basic/Attr.td "${docs_targets}") gen_rst_file_from_td(DiagnosticsReference.rst -gen-diag-docs ../include/clang/Basic/Diagnostic.td "${docs_targets}") - gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Options/ClangOptionDocs.td "${docs_targets}") + gen_rst_file_from_td(ClangCommandLineReference.rst -gen-opt-docs ../include/clang/Driver/ClangOptionDocs.td "${docs_targets}") # Another generated file from a different source set(docs_tools_dir ${CMAKE_CURRENT_SOURCE_DIR}/tools) diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index a849d05eb7ae9..eff46ab46e1ca 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -667,7 +667,7 @@ Command Line Interface ---------------------- The command line interface of the Clang ``-cc1`` frontend is defined alongside -the driver options in ``clang/Options/Options.td``. The information making up an +the driver options in ``clang/Driver/Options.td``. The information making up an option definition includes its prefix and name (for example ``-std=``), form and position of the option value, help text, aliases and more. Each option may belong to a certain group and can be marked with zero or more flags. Options @@ -712,7 +712,7 @@ variable for the option value: } Next, declare the command line interface of the option in the tablegen file -``clang/include/clang/Options/Options.td``. This is done by instantiating the +``clang/include/clang/Driver/Options.td``. This is done by instantiating the ``Option`` class (defined in ``llvm/include/llvm/Option/OptParser.td``). The instance is typically created through one of the helper classes that encode the acceptable ways to specify the option value on the command line: @@ -906,7 +906,7 @@ command line: SHOULD_PARSE, KEYPATH, DEFAULT_VALUE, \ IMPLIED_CHECK, IMPLIED_VALUE, NORMALIZER, \ MERGER, TABLE_INDEX) - #include "clang/Options/Options.inc" + #include "clang/Driver/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // ... @@ -925,7 +925,7 @@ command line: GENERATE_OPTION_WITH_MARSHALLING( \ Args, SA, KIND, FLAGS, SPELLING, ALWAYS_EMIT, KEYPATH, DEFAULT_VALUE, \ IMPLIED_CHECK, IMPLIED_VALUE, DENORMALIZER, EXTRACTOR, TABLE_INDEX) - #include "clang/Options/Options.inc" + #include "clang/Driver/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // ... diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 9d3ac48d2e0ca..5f356daec2d04 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -79,9 +79,6 @@ Potentially Breaking Changes void foo(void) { return ({ 1;; }); } -- Downstream projects that previously linked only against ``clangDriver`` may - now (also) need to link against the new ``clangOptions`` library, since - options-related code has been moved out of the Driver into a separate library. C/C++ Language Potentially Breaking Changes ------------------------------------------- diff --git a/clang/include/clang/CMakeLists.txt b/clang/include/clang/CMakeLists.txt index 77a44e4c48de5..47ac70cd21690 100644 --- a/clang/include/clang/CMakeLists.txt +++ b/clang/include/clang/CMakeLists.txt @@ -3,7 +3,7 @@ add_subdirectory(Basic) if(CLANG_ENABLE_CIR) add_subdirectory(CIR) endif() -add_subdirectory(Options) +add_subdirectory(Driver) add_subdirectory(Parse) add_subdirectory(Sema) add_subdirectory(Serialization) diff --git a/clang/include/clang/Options/CMakeLists.txt b/clang/include/clang/Driver/CMakeLists.txt similarity index 100% rename from clang/include/clang/Options/CMakeLists.txt rename to clang/include/clang/Driver/CMakeLists.txt diff --git a/clang/include/clang/Options/ClangOptionDocs.td b/clang/include/clang/Driver/ClangOptionDocs.td similarity index 100% rename from clang/include/clang/Options/ClangOptionDocs.td rename to clang/include/clang/Driver/ClangOptionDocs.td diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 297afe26812a0..419089731569b 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -15,11 +15,11 @@ #include "clang/Driver/Action.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/InputInfo.h" +#include "clang/Driver/Options.h" #include "clang/Driver/Phases.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Types.h" #include "clang/Driver/Util.h" -#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/StringMap.h" diff --git a/clang/include/clang/Options/OptionUtils.h b/clang/include/clang/Driver/OptionUtils.h similarity index 94% rename from clang/include/clang/Options/OptionUtils.h rename to clang/include/clang/Driver/OptionUtils.h index 83c48bd7d6843..922f536bf33ea 100644 --- a/clang/include/clang/Options/OptionUtils.h +++ b/clang/include/clang/Driver/OptionUtils.h @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_OPTIONS_OPTIONUTILS_H -#define LLVM_CLANG_OPTIONS_OPTIONUTILS_H +#ifndef LLVM_CLANG_DRIVER_OPTIONUTILS_H +#define LLVM_CLANG_DRIVER_OPTIONUTILS_H #include "clang/Basic/Diagnostic.h" #include "clang/Basic/LLVM.h" @@ -55,4 +55,4 @@ inline uint64_t getLastArgUInt64Value(const llvm::opt::ArgList &Args, } // namespace clang -#endif // LLVM_CLANG_OPTIONS_OPTIONUTILS_H +#endif // LLVM_CLANG_DRIVER_OPTIONUTILS_H diff --git a/clang/include/clang/Options/Options.h b/clang/include/clang/Driver/Options.h similarity index 83% rename from clang/include/clang/Options/Options.h rename to clang/include/clang/Driver/Options.h index ac98699001965..0797410e9940e 100644 --- a/clang/include/clang/Options/Options.h +++ b/clang/include/clang/Driver/Options.h @@ -6,13 +6,14 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_OPTIONS_OPTIONS_H -#define LLVM_CLANG_OPTIONS_OPTIONS_H +#ifndef LLVM_CLANG_DRIVER_OPTIONS_H +#define LLVM_CLANG_DRIVER_OPTIONS_H #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" namespace clang { +namespace driver { namespace options { /// Flags specifically for clang options. Must not overlap with @@ -41,15 +42,16 @@ enum ClangVisibility { }; enum ID { - OPT_INVALID = 0, // This is not an option ID. + OPT_INVALID = 0, // This is not an option ID. #define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__), -#include "clang/Options/Options.inc" - LastOption +#include "clang/Driver/Options.inc" + LastOption #undef OPTION -}; -} // namespace options + }; +} const llvm::opt::OptTable &getDriverOptTable(); -} // namespace clang +} +} -#endif // LLVM_CLANG_OPTIONS_OPTIONS_H +#endif diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Driver/Options.td similarity index 100% rename from clang/include/clang/Options/Options.td rename to clang/include/clang/Driver/Options.td diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h index ed2703c76f18d..49fd920d1ec43 100644 --- a/clang/include/clang/Frontend/Utils.h +++ b/clang/include/clang/Frontend/Utils.h @@ -15,8 +15,8 @@ #include "clang/Basic/Diagnostic.h" #include "clang/Basic/LLVM.h" +#include "clang/Driver/OptionUtils.h" #include "clang/Frontend/DependencyOutputOptions.h" -#include "clang/Options/OptionUtils.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/StringMap.h" diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap index a11c8683c601e..c5535262ae38c 100644 --- a/clang/include/module.modulemap +++ b/clang/include/module.modulemap @@ -146,7 +146,6 @@ module Clang_Lex { module * { export * } } -module Clang_Options { requires cplusplus umbrella "clang/Options" module * { export * } } module Clang_Parse { requires cplusplus umbrella "clang/Parse" module * { export * } } module Clang_Rewrite { requires cplusplus umbrella "clang/Rewrite/Core" module * { export * } } module Clang_RewriteFrontend { requires cplusplus umbrella "clang/Rewrite/Frontend" module * { export * } } diff --git a/clang/lib/CMakeLists.txt b/clang/lib/CMakeLists.txt index e90b009da606a..4f2218b583e41 100644 --- a/clang/lib/CMakeLists.txt +++ b/clang/lib/CMakeLists.txt @@ -13,7 +13,6 @@ add_subdirectory(Edit) add_subdirectory(ExtractAPI) add_subdirectory(Rewrite) add_subdirectory(Driver) -add_subdirectory(Options) add_subdirectory(Serialization) add_subdirectory(Frontend) add_subdirectory(FrontendTool) diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt index 8577b3af257a7..a8d83d38264a7 100644 --- a/clang/lib/Driver/CMakeLists.txt +++ b/clang/lib/Driver/CMakeLists.txt @@ -20,10 +20,12 @@ add_clang_library(clangDriver Compilation.cpp Distro.cpp Driver.cpp + DriverOptions.cpp Job.cpp Multilib.cpp MultilibBuilder.cpp OffloadBundler.cpp + OptionUtils.cpp Phases.cpp SanitizerArgs.cpp Tool.cpp @@ -99,7 +101,6 @@ add_clang_library(clangDriver LINK_LIBS clangBasic clangLex - clangOptions ${system_libs} ${LLVM_PTHREAD_LIB} ) diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp index 4a75ae2c74c51..665d81f99ba45 100644 --- a/clang/lib/Driver/Compilation.cpp +++ b/clang/lib/Driver/Compilation.cpp @@ -11,9 +11,9 @@ #include "clang/Driver/Action.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" +#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Util.h" -#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/OptSpecifier.h" #include "llvm/Option/Option.h" diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index fb892ae02be9a..89c4b7363f21c 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -60,6 +60,8 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Job.h" +#include "clang/Driver/Options.h" +#include "clang/Driver/OptionUtils.h" #include "clang/Driver/Phases.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/Tool.h" @@ -67,7 +69,6 @@ #include "clang/Driver/ToolChain.h" #include "clang/Driver/Util.h" #include "clang/Lex/DependencyDirectivesScanner.h" -#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" diff --git a/clang/lib/Options/DriverOptions.cpp b/clang/lib/Driver/DriverOptions.cpp similarity index 76% rename from clang/lib/Options/DriverOptions.cpp rename to clang/lib/Driver/DriverOptions.cpp index d91e9291fb2f6..cde1f8989935b 100644 --- a/clang/lib/Options/DriverOptions.cpp +++ b/clang/lib/Driver/DriverOptions.cpp @@ -6,32 +6,33 @@ // //===----------------------------------------------------------------------===// -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/OptTable.h" #include -using namespace clang::options; +using namespace clang::driver; +using namespace clang::driver::options; using namespace llvm::opt; #define OPTTABLE_STR_TABLE_CODE -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef OPTTABLE_STR_TABLE_CODE #define OPTTABLE_VALUES_CODE -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef OPTTABLE_VALUES_CODE #define OPTTABLE_PREFIXES_TABLE_CODE -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef OPTTABLE_PREFIXES_TABLE_CODE #define OPTTABLE_PREFIXES_UNION_CODE -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef OPTTABLE_PREFIXES_UNION_CODE static constexpr OptTable::Info InfoTable[] = { #define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__), -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef OPTION }; @@ -43,9 +44,9 @@ class DriverOptTable : public PrecomputedOptTable { : PrecomputedOptTable(OptionStrTable, OptionPrefixesTable, InfoTable, OptionPrefixesUnion) {} }; -} // anonymous namespace +} -const llvm::opt::OptTable &clang::getDriverOptTable() { +const llvm::opt::OptTable &clang::driver::getDriverOptTable() { static DriverOptTable Table; return Table; } diff --git a/clang/lib/Options/OptionUtils.cpp b/clang/lib/Driver/OptionUtils.cpp similarity index 97% rename from clang/lib/Options/OptionUtils.cpp rename to clang/lib/Driver/OptionUtils.cpp index fcafd3c83c6b3..1f36ffc03cab3 100644 --- a/clang/lib/Options/OptionUtils.cpp +++ b/clang/lib/Driver/OptionUtils.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "clang/Options/OptionUtils.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticDriver.h" +#include "clang/Driver/OptionUtils.h" #include "llvm/Option/ArgList.h" using namespace clang; diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 420c4cddbc8dd..5dd48f53b9069 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -8,8 +8,8 @@ #include "clang/Driver/SanitizerArgs.h" #include "clang/Basic/Sanitizers.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" -#include "clang/Options/Options.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 5ee53c01d6e30..3452ce5339174 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -21,9 +21,9 @@ #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Job.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/XRayArgs.h" -#include "clang/Options/Options.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" @@ -338,7 +338,7 @@ static void getRISCVMultilibFlags(const Driver &D, const llvm::Triple &Triple, Multilib::flags_list ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const { - using namespace clang::options; + using namespace clang::driver::options; std::vector Result; const llvm::Triple Triple(ComputeEffectiveClangTriple(Args)); @@ -1816,7 +1816,7 @@ void ToolChain::TranslateXarchArgs( unsigned Index = BaseArgs.MakeIndex(A->getValue(ValuePos)); unsigned Prev = Index; std::unique_ptr XarchArg(Opts.ParseOneArg( - Args, Index, llvm::opt::Visibility(options::ClangOption))); + Args, Index, llvm::opt::Visibility(clang::driver::options::ClangOption))); // If the argument parsing failed or more than one argument was // consumed, the -Xarch_ argument's parameter tried to consume diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index 21e7dcad3cdd0..ffd7b69205440 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -9,8 +9,8 @@ #include "AIX.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" @@ -19,7 +19,6 @@ #include using AIX = clang::driver::toolchains::AIX; -using namespace clang; using namespace clang::driver; using namespace clang::driver::tools; using namespace clang::driver::toolchains; @@ -168,7 +167,8 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, Args.hasArg(options::OPT_coverage)) CmdArgs.push_back("-bdbg:namedsects:ss"); - if (Arg *A = Args.getLastArg(options::OPT_mxcoff_build_id_EQ)) { + if (Arg *A = + Args.getLastArg(clang::driver::options::OPT_mxcoff_build_id_EQ)) { StringRef BuildId = A->getValue(); if (BuildId[0] != '0' || BuildId[1] != 'x' || BuildId.find_if_not(llvm::isHexDigit, 2) != StringRef::npos) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 3b5411179349b..edfc62f0ad7d5 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Error.h" @@ -324,24 +324,27 @@ RocmInstallationDetector::RocmInstallationDetector( const llvm::opt::ArgList &Args, bool DetectHIPRuntime, bool DetectDeviceLib) : D(D) { Verbose = Args.hasArg(options::OPT_v); - RocmPathArg = Args.getLastArgValue(options::OPT_rocm_path_EQ); - PrintROCmSearchDirs = Args.hasArg(options::OPT_print_rocm_search_dirs); + RocmPathArg = Args.getLastArgValue(clang::driver::options::OPT_rocm_path_EQ); + PrintROCmSearchDirs = + Args.hasArg(clang::driver::options::OPT_print_rocm_search_dirs); RocmDeviceLibPathArg = - Args.getAllArgValues(options::OPT_rocm_device_lib_path_EQ); - HIPPathArg = Args.getLastArgValue(options::OPT_hip_path_EQ); - HIPStdParPathArg = Args.getLastArgValue(options::OPT_hipstdpar_path_EQ); + Args.getAllArgValues(clang::driver::options::OPT_rocm_device_lib_path_EQ); + HIPPathArg = Args.getLastArgValue(clang::driver::options::OPT_hip_path_EQ); + HIPStdParPathArg = + Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_path_EQ); HasHIPStdParLibrary = !HIPStdParPathArg.empty() && D.getVFS().exists(HIPStdParPathArg + "/hipstdpar_lib.hpp"); HIPRocThrustPathArg = - Args.getLastArgValue(options::OPT_hipstdpar_thrust_path_EQ); + Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_thrust_path_EQ); HasRocThrustLibrary = !HIPRocThrustPathArg.empty() && D.getVFS().exists(HIPRocThrustPathArg + "/thrust"); - HIPRocPrimPathArg = Args.getLastArgValue(options::OPT_hipstdpar_prim_path_EQ); + HIPRocPrimPathArg = + Args.getLastArgValue(clang::driver::options::OPT_hipstdpar_prim_path_EQ); HasRocPrimLibrary = !HIPRocPrimPathArg.empty() && D.getVFS().exists(HIPRocPrimPathArg + "/rocprim"); - if (auto *A = Args.getLastArg(options::OPT_hip_version_EQ)) { + if (auto *A = Args.getLastArg(clang::driver::options::OPT_hip_version_EQ)) { HIPVersionArg = A->getValue(); unsigned Major = ~0U; unsigned Minor = ~0U; diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index c8601d1dedbcb..7185b24aec0f8 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -11,9 +11,9 @@ #include "Gnu.h" #include "clang/Basic/TargetID.h" +#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" -#include "clang/Options/Options.h" #include "llvm/ADT/SmallString.h" #include "llvm/TargetParser/TargetParser.h" diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index 324369bfa2cc1..e73b4f02cac39 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -11,8 +11,9 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" +#include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/Tool.h" -#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp index 588255dc5a0cd..731076d9754a9 100644 --- a/clang/lib/Driver/ToolChains/AVR.cpp +++ b/clang/lib/Driver/ToolChains/AVR.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp index d6fb2a57539ed..e8d5e38a9064f 100644 --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -9,7 +9,7 @@ #include "AArch64.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/TargetParser/Host.h" @@ -222,7 +222,7 @@ void aarch64::getAArch64TargetFeatures(const Driver &D, // Default to 'A' profile if the architecture is not specified. success = getAArch64ArchFeaturesFromMarch(D, "armv8-a", Args, Extensions); - if (success && (A = Args.getLastArg(options::OPT_mtune_EQ))) + if (success && (A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ))) success = getAArch64MicroArchFeaturesFromMtune(D, A->getValue(), Args, Features); else if (success && (A = Args.getLastArg(options::OPT_mcpu_EQ))) diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp index 55eb2dcf7ddf4..61beb0455147d 100644 --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp @@ -8,7 +8,7 @@ #include "ARM.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/ARMTargetParser.h" @@ -74,7 +74,7 @@ bool arm::isARMEABIBareMetal(const llvm::Triple &Triple) { // Get Arch/CPU from args. void arm::getARMArchCPUFromArgs(const ArgList &Args, llvm::StringRef &Arch, llvm::StringRef &CPU, bool FromAs) { - if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) CPU = A->getValue(); if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) Arch = A->getValue(); diff --git a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp index 65f6534e4d038..2fd2c72147f5b 100644 --- a/clang/lib/Driver/ToolChains/Arch/CSKY.cpp +++ b/clang/lib/Driver/ToolChains/Arch/CSKY.cpp @@ -8,7 +8,7 @@ #include "CSKY.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/CSKYTargetParser.h" @@ -33,7 +33,7 @@ csky::getCSKYArchName(const Driver &D, const ArgList &Args, return std::optional(A->getValue()); } - if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { llvm::CSKY::ArchKind ArchKind = llvm::CSKY::parseCPUArch(A->getValue()); if (ArchKind == llvm::CSKY::ArchKind::INVALID) { D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args); @@ -126,7 +126,7 @@ void csky::getCSKYTargetFeatures(const Driver &D, const llvm::Triple &Triple, archName = A->getValue(); } - if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { llvm::CSKY::ArchKind Kind = llvm::CSKY::parseCPUArch(A->getValue()); if (Kind == llvm::CSKY::ArchKind::INVALID) { D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args); diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index da084bdabaee3..156ea03045569 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -11,7 +11,7 @@ #include "clang/Basic/DiagnosticDriver.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/TargetParser/Host.h" #include "llvm/TargetParser/LoongArchTargetParser.h" @@ -130,7 +130,8 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, const ArgList &Args, std::vector &Features) { // Enable the `lsx` feature on 64-bit LoongArch by default. - if (Triple.isLoongArch64() && (!Args.hasArgNoClaim(options::OPT_march_EQ))) + if (Triple.isLoongArch64() && + (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ))) Features.push_back("+lsx"); // -mrelax is default, unless -mno-relax is specified. diff --git a/clang/lib/Driver/ToolChains/Arch/M68k.cpp b/clang/lib/Driver/ToolChains/Arch/M68k.cpp index a620597f10475..708ec84a37cfb 100644 --- a/clang/lib/Driver/ToolChains/Arch/M68k.cpp +++ b/clang/lib/Driver/ToolChains/Arch/M68k.cpp @@ -8,7 +8,7 @@ #include "M68k.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Regex.h" @@ -21,7 +21,7 @@ using namespace llvm::opt; /// getM68kTargetCPU - Get the (LLVM) name of the 68000 cpu we are targeting. std::string m68k::getM68kTargetCPU(const ArgList &Args) { - if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { + if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { // The canonical CPU name is captalize. However, we allow // starting with lower case or numbers only StringRef CPUName = A->getValue(); @@ -45,17 +45,17 @@ std::string m68k::getM68kTargetCPU(const ArgList &Args) { .Default(CPUName.str()); } // FIXME: Throw error when multiple sub-architecture flag exist - if (Args.hasArg(options::OPT_m68000)) + if (Args.hasArg(clang::driver::options::OPT_m68000)) return "M68000"; - if (Args.hasArg(options::OPT_m68010)) + if (Args.hasArg(clang::driver::options::OPT_m68010)) return "M68010"; - if (Args.hasArg(options::OPT_m68020)) + if (Args.hasArg(clang::driver::options::OPT_m68020)) return "M68020"; - if (Args.hasArg(options::OPT_m68030)) + if (Args.hasArg(clang::driver::options::OPT_m68030)) return "M68030"; - if (Args.hasArg(options::OPT_m68040)) + if (Args.hasArg(clang::driver::options::OPT_m68040)) return "M68040"; - if (Args.hasArg(options::OPT_m68060)) + if (Args.hasArg(clang::driver::options::OPT_m68060)) return "M68060"; return ""; diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp index 103aae7018fbf..8d7b85dbeed99 100644 --- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp @@ -9,7 +9,7 @@ #include "Mips.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" @@ -49,7 +49,8 @@ void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple, DefMips64CPU = "mips3"; } - if (Arg *A = Args.getLastArg(options::OPT_march_EQ, options::OPT_mcpu_EQ)) + if (Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ, + options::OPT_mcpu_EQ)) CPUName = A->getValue(); if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) { diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp index 44afdd249fea5..361a68a892a8f 100644 --- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp +++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp @@ -9,7 +9,7 @@ #include "PPC.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/Host.h" diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp index 1dcce6d053a39..f2e79e71f93d4 100644 --- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp +++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp @@ -10,7 +10,7 @@ #include "../Clang.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Error.h" #include "llvm/TargetParser/Host.h" diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp index 49256d80cbdf6..94a94f1e9c487 100644 --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp @@ -8,7 +8,7 @@ #include "Sparc.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/Host.h" @@ -122,7 +122,7 @@ sparc::FloatABI sparc::getSparcFloatABI(const Driver &D, std::string sparc::getSparcTargetCPU(const Driver &D, const ArgList &Args, const llvm::Triple &Triple) { - if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { StringRef CPUName = A->getValue(); if (CPUName == "native") { std::string CPU = std::string(llvm::sys::getHostCPUName()); diff --git a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp index 1ef6a725483e8..75b6afd925245 100644 --- a/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp +++ b/clang/lib/Driver/ToolChains/Arch/SystemZ.cpp @@ -8,7 +8,7 @@ #include "SystemZ.h" #include "clang/Config/config.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/TargetParser/Host.h" @@ -25,9 +25,9 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D, D.Diag(diag::err_drv_unsupported_opt) << Args.getLastArg(options::OPT_mfloat_abi_EQ)->getAsString(Args); - if (Arg *A = - Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float)) - if (A->getOption().matches(options::OPT_msoft_float)) + if (Arg *A = Args.getLastArg(clang::driver::options::OPT_msoft_float, + options::OPT_mhard_float)) + if (A->getOption().matches(clang::driver::options::OPT_msoft_float)) ABI = systemz::FloatABI::Soft; return ABI; @@ -35,7 +35,7 @@ systemz::FloatABI systemz::getSystemZFloatABI(const Driver &D, std::string systemz::getSystemZTargetCPU(const ArgList &Args, const llvm::Triple &T) { - if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { llvm::StringRef CPUName = A->getValue(); if (CPUName == "native") { diff --git a/clang/lib/Driver/ToolChains/Arch/VE.cpp b/clang/lib/Driver/ToolChains/Arch/VE.cpp index c8353d7dc5f3a..adc0873586588 100644 --- a/clang/lib/Driver/ToolChains/Arch/VE.cpp +++ b/clang/lib/Driver/ToolChains/Arch/VE.cpp @@ -8,7 +8,7 @@ #include "VE.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" using namespace clang::driver; diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 092069b6ade56..1373905a5120e 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -8,7 +8,7 @@ #include "X86.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/Option/ArgList.h" @@ -21,7 +21,7 @@ using namespace llvm::opt; std::string x86::getX86TargetCPU(const Driver &D, const ArgList &Args, const llvm::Triple &Triple) { - if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { StringRef CPU = A->getValue(); if (CPU != "native") return std::string(CPU); @@ -119,7 +119,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, std::vector &Features) { // Claim and report unsupported -mabi=. Note: we don't support "sysv_abi" or // "ms_abi" as default function attributes. - if (const Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) { + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mabi_EQ)) { StringRef DefaultAbi = (Triple.isOSWindows() || Triple.isUEFI()) ? "ms" : "sysv"; if (A->getValue() != DefaultAbi) @@ -128,7 +128,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, } // If -march=native, autodetect the feature list. - if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { if (StringRef(A->getValue()) == "native") { for (auto &F : llvm::sys::getHostCPUFeatures()) Features.push_back( @@ -163,7 +163,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, // flags). This is a bit hacky but keeps existing usages working. We should // consider deprecating this and instead warn if the user requests external // retpoline thunks and *doesn't* request some form of retpolines. - auto SpectreOpt = options::ID::OPT_INVALID; + auto SpectreOpt = clang::driver::options::ID::OPT_INVALID; if (Args.hasArgNoClaim(options::OPT_mretpoline, options::OPT_mno_retpoline, options::OPT_mspeculative_load_hardening, options::OPT_mno_speculative_load_hardening)) { @@ -189,7 +189,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, SpectreOpt = options::OPT_mretpoline_external_thunk; } - auto LVIOpt = options::ID::OPT_INVALID; + auto LVIOpt = clang::driver::options::ID::OPT_INVALID; if (Args.hasFlag(options::OPT_mlvi_hardening, options::OPT_mno_lvi_hardening, false)) { Features.push_back("+lvi-load-hardening"); @@ -207,7 +207,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, << D.getOpts().getOptionName(options::OPT_mlvi_hardening) << D.getOpts().getOptionName(options::OPT_m_seses); - if (SpectreOpt != options::ID::OPT_INVALID) + if (SpectreOpt != clang::driver::options::ID::OPT_INVALID) D.Diag(diag::err_drv_argument_not_allowed_with) << D.getOpts().getOptionName(SpectreOpt) << D.getOpts().getOptionName(options::OPT_m_seses); @@ -219,8 +219,8 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, } } - if (SpectreOpt != options::ID::OPT_INVALID && - LVIOpt != options::ID::OPT_INVALID) { + if (SpectreOpt != clang::driver::options::ID::OPT_INVALID && + LVIOpt != clang::driver::options::ID::OPT_INVALID) { D.Diag(diag::err_drv_argument_not_allowed_with) << D.getOpts().getOptionName(SpectreOpt) << D.getOpts().getOptionName(LVIOpt); diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index 8d598be9ffb0a..9b7f58c392885 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -18,7 +18,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/MultilibBuilder.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" @@ -135,7 +135,7 @@ static std::string computeClangRuntimesSysRoot(const Driver &D, bool BareMetal::initGCCInstallation(const llvm::Triple &Triple, const llvm::opt::ArgList &Args) { if (Args.getLastArg(options::OPT_gcc_toolchain) || - Args.getLastArg(clang::options::OPT_gcc_install_dir_EQ)) { + Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) { GCCInstallation.init(Triple, Args); return GCCInstallation.isValid(); } diff --git a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp index c561d7d38da5b..e4db3307ee3aa 100644 --- a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp +++ b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 5c5609a3c8377..c5d8dea358c0a 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -30,10 +30,10 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Distro.h" #include "clang/Driver/InputInfo.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/Types.h" #include "clang/Driver/XRayArgs.h" -#include "clang/Options/Options.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" @@ -66,7 +66,7 @@ using namespace clang; using namespace llvm::opt; static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) { - if (Arg *A = Args.getLastArg(options::OPT_C, options::OPT_CC, + if (Arg *A = Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC, options::OPT_fminimize_whitespace, options::OPT_fno_minimize_whitespace, options::OPT_fkeep_system_includes, @@ -1700,7 +1700,7 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args, AddAAPCSVolatileBitfieldArgs(Args, CmdArgs); - if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) { CmdArgs.push_back("-tune-cpu"); if (strcmp(A->getValue(), "native") == 0) CmdArgs.push_back(Args.MakeArgString(llvm::sys::getHostCPUName())); @@ -2106,7 +2106,7 @@ void Clang::AddSparcTargetArgs(const ArgList &Args, CmdArgs.push_back("hard"); } - if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) { StringRef Name = A->getValue(); std::string TuneCPU; if (Name == "native") @@ -2212,11 +2212,12 @@ void Clang::AddX86TargetArgs(const ArgList &Args, // Default to "generic" unless -march is present or targetting the PS4/PS5. std::string TuneCPU; - if (!Args.hasArg(options::OPT_march_EQ) && !getToolChain().getTriple().isPS()) + if (!Args.hasArg(clang::driver::options::OPT_march_EQ) && + !getToolChain().getTriple().isPS()) TuneCPU = "generic"; // Override based on -mtune. - if (const Arg *A = Args.getLastArg(options::OPT_mtune_EQ)) { + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mtune_EQ)) { StringRef Name = A->getValue(); if (Name == "native") { diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index d4f87dec572d9..52e43d1a003eb 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -31,11 +31,11 @@ #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Job.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/ToolChain.h" #include "clang/Driver/Util.h" #include "clang/Driver/XRayArgs.h" -#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" @@ -88,7 +88,8 @@ static bool addRPathCmdArg(const llvm::opt::ArgList &Args, static bool useFramePointerForTargetByDefault(const llvm::opt::ArgList &Args, const llvm::Triple &Triple) { - if (Args.hasArg(options::OPT_pg) && !Args.hasArg(options::OPT_mfentry)) + if (Args.hasArg(clang::driver::options::OPT_pg) && + !Args.hasArg(clang::driver::options::OPT_mfentry)) return true; if (Triple.isAndroid()) @@ -267,16 +268,17 @@ getFramePointerKind(const llvm::opt::ArgList &Args, // without requiring new frame records to be created. bool DefaultFP = useFramePointerForTargetByDefault(Args, Triple); - bool EnableFP = mustUseNonLeafFramePointerForTarget(Triple) || - Args.hasFlag(options::OPT_fno_omit_frame_pointer, - options::OPT_fomit_frame_pointer, DefaultFP); + bool EnableFP = + mustUseNonLeafFramePointerForTarget(Triple) || + Args.hasFlag(clang::driver::options::OPT_fno_omit_frame_pointer, + clang::driver::options::OPT_fomit_frame_pointer, DefaultFP); bool DefaultLeafFP = useLeafFramePointerForTargetByDefault(Triple) || (EnableFP && framePointerImpliesLeafFramePointer(Args, Triple)); - bool EnableLeafFP = - Args.hasFlag(options::OPT_mno_omit_leaf_frame_pointer, - options::OPT_momit_leaf_frame_pointer, DefaultLeafFP); + bool EnableLeafFP = Args.hasFlag( + clang::driver::options::OPT_mno_omit_leaf_frame_pointer, + clang::driver::options::OPT_momit_leaf_frame_pointer, DefaultLeafFP); bool FPRegReserved = EnableFP || mustMaintainValidFrameChain(Args, Triple); @@ -775,7 +777,7 @@ std::string tools::getCPUName(const Driver &D, const ArgList &Args, case llvm::Triple::ppcle: case llvm::Triple::ppc64: case llvm::Triple::ppc64le: - if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) + if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) return std::string( llvm::PPC::getNormalizedPPCTargetCPU(T, A->getValue())); return std::string(llvm::PPC::getNormalizedPPCTargetCPU(T)); @@ -1867,7 +1869,7 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, if (SanArgs.needsFuzzerInterceptors()) addSanitizerRuntime(TC, Args, CmdArgs, "fuzzer_interceptors", false, true); - if (!Args.hasArg(options::OPT_nostdlibxx)) { + if (!Args.hasArg(clang::driver::options::OPT_nostdlibxx)) { bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) && !Args.hasArg(options::OPT_static); if (OnlyLibstdcxxStatic) @@ -3583,7 +3585,7 @@ void tools::handleInterchangeLoopsArgs(const ArgList &Args, // Otherwise, return an empty string and issue a diagnosic message if needed. StringRef tools::parseMPreferVectorWidthOption(clang::DiagnosticsEngine &Diags, const llvm::opt::ArgList &Args) { - Arg *A = Args.getLastArg(options::OPT_mprefer_vector_width_EQ); + Arg *A = Args.getLastArg(clang::driver::options::OPT_mprefer_vector_width_EQ); if (!A) return ""; diff --git a/clang/lib/Driver/ToolChains/CrossWindows.cpp b/clang/lib/Driver/ToolChains/CrossWindows.cpp index 6df5315e8fff8..51c892fc91718 100644 --- a/clang/lib/Driver/ToolChains/CrossWindows.cpp +++ b/clang/lib/Driver/ToolChains/CrossWindows.cpp @@ -10,8 +10,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index de3cf7abbdec5..8fc1bded2acea 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -15,7 +15,7 @@ #include "clang/Driver/Distro.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE #include "llvm/Option/ArgList.h" @@ -154,16 +154,16 @@ CudaInstallationDetector::CudaInstallationDetector( std::initializer_list Versions = {"8.0", "7.5", "7.0"}; auto &FS = D.getVFS(); - if (Args.hasArg(options::OPT_cuda_path_EQ)) { + if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) { Candidates.emplace_back( - Args.getLastArgValue(options::OPT_cuda_path_EQ).str()); + Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str()); } else if (HostTriple.isOSWindows()) { for (const char *Ver : Versions) Candidates.emplace_back( D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" + Ver); } else { - if (!Args.hasArg(options::OPT_cuda_path_ignore_env)) { + if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) { // Try to find ptxas binary. If the executable is located in a directory // called 'bin/', its parent directory might be a good guess for a valid // CUDA installation. diff --git a/clang/lib/Driver/ToolChains/Cygwin.cpp b/clang/lib/Driver/ToolChains/Cygwin.cpp index 55438125ce0f1..d9c16347daa34 100644 --- a/clang/lib/Driver/ToolChains/Cygwin.cpp +++ b/clang/lib/Driver/ToolChains/Cygwin.cpp @@ -10,7 +10,7 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Support/Path.h" #include "llvm/Support/VirtualFileSystem.h" @@ -58,7 +58,7 @@ void Cygwin::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index fc3cd9030f71d..2fb7652d64536 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -14,8 +14,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" @@ -1079,7 +1079,7 @@ StringRef MachO::getMachOArchName(const ArgList &Args) const { case llvm::Triple::thumb: case llvm::Triple::arm: - if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) + if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) if (const char *Arch = ArmMachOArchName(A->getValue())) return Arch; @@ -2993,7 +2993,7 @@ DerivedArgList *MachO::TranslateArgs(const DerivedArgList &Args, if (!BoundArch.empty()) { StringRef Name = BoundArch; const Option MCpu = Opts.getOption(options::OPT_mcpu_EQ); - const Option MArch = Opts.getOption(options::OPT_march_EQ); + const Option MArch = Opts.getOption(clang::driver::options::OPT_march_EQ); // This code must be kept in sync with LLVM's getArchTypeForDarwinArch, // which defines the list of which architectures we accept. diff --git a/clang/lib/Driver/ToolChains/DragonFly.cpp b/clang/lib/Driver/ToolChains/DragonFly.cpp index d4a6d6ae3e349..524f5f2ff391e 100644 --- a/clang/lib/Driver/ToolChains/DragonFly.cpp +++ b/clang/lib/Driver/ToolChains/DragonFly.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" @@ -219,7 +219,7 @@ void DragonFly::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index c6038ac85514c..f24dd3982eab7 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -11,7 +11,7 @@ #include "clang/Basic/CodeGenOptions.h" #include "clang/Driver/CommonArgs.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Frontend/Debug/Options.h" #include "llvm/Support/Path.h" #include "llvm/TargetParser/Host.h" @@ -236,8 +236,9 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_frepack_arrays_contiguity_EQ, options::OPT_fstack_repack_arrays, options::OPT_fno_stack_repack_arrays, options::OPT_ftime_report, options::OPT_ftime_report_EQ, - options::OPT_funroll_loops, options::OPT_fno_unroll_loops}); - if (Args.hasArg(options::OPT_fcoarray)) + options::OPT_funroll_loops, options::OPT_fno_unroll_loops, + options::OPT_fdefer_desc_map, options::OPT_fno_defer_desc_map}); + if (Args.hasArg(clang::driver::options::OPT_fcoarray)) CmdArgs.push_back("-fcoarray"); } diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp index 70e66a2f5c3e7..b17b76233ad30 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -13,8 +13,8 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" @@ -404,7 +404,7 @@ void FreeBSD::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 9edfc4de3d602..507cc03b27513 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/MultilibBuilder.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -344,7 +344,7 @@ Tool *Fuchsia::buildStaticLibTool() const { ToolChain::RuntimeLibType Fuchsia::GetRuntimeLibType(const ArgList &Args) const { - if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) { + if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) { StringRef Value = A->getValue(); if (Value != "compiler-rt") getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index d56cb02e14ca9..b7cd5f6be14a6 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -20,9 +20,9 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/MultilibBuilder.h" +#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" -#include "clang/Options/Options.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" #include "llvm/Option/ArgList.h" @@ -2104,7 +2104,7 @@ Generic_GCC::GCCVersion Generic_GCC::GCCVersion::Parse(StringRef VersionText) { static llvm::StringRef getGCCToolchainDir(const ArgList &Args, llvm::StringRef SysRoot) { - const Arg *A = Args.getLastArg(options::OPT_gcc_toolchain); + const Arg *A = Args.getLastArg(clang::driver::options::OPT_gcc_toolchain); if (A) return A->getValue(); @@ -2157,7 +2157,8 @@ void Generic_GCC::GCCInstallationDetector::init( CandidateBiarchTripleAliases); // If --gcc-install-dir= is specified, skip filesystem detection. - if (const Arg *A = Args.getLastArg(options::OPT_gcc_install_dir_EQ); + if (const Arg *A = + Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ); A && A->getValue()[0]) { StringRef InstallDir = A->getValue(); if (!ScanGCCForMultilibs(TargetTriple, Args, InstallDir, false)) { @@ -2180,7 +2181,8 @@ void Generic_GCC::GCCInstallationDetector::init( // If --gcc-triple is specified use this instead of trying to // auto-detect a triple. - if (const Arg *A = Args.getLastArg(options::OPT_gcc_triple_EQ)) { + if (const Arg *A = + Args.getLastArg(clang::driver::options::OPT_gcc_triple_EQ)) { StringRef GCCTriple = A->getValue(); CandidateTripleAliases.clear(); CandidateTripleAliases.push_back(GCCTriple); diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp index b1c30beae1d35..d0d2d2e34b602 100644 --- a/clang/lib/Driver/ToolChains/HIPAMD.cpp +++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp @@ -15,8 +15,8 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/TargetParser/TargetParser.h" diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp index 44265e8a8b95f..fb738577c4c44 100644 --- a/clang/lib/Driver/ToolChains/HIPSPV.cpp +++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp @@ -12,7 +12,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -213,7 +213,7 @@ HIPSPVToolChain::getDeviceLibs( // Find device libraries in --hip-device-lib-path and HIP_DEVICE_LIB_PATH. auto HipDeviceLibPathArgs = DriverArgs.getAllArgValues( // --hip-device-lib-path is alias to this option. - options::OPT_rocm_device_lib_path_EQ); + clang::driver::options::OPT_rocm_device_lib_path_EQ); for (auto Path : HipDeviceLibPathArgs) LibraryPaths.push_back(DriverArgs.MakeArgString(Path)); diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp index 1af2ae6470f1e..732403e69a075 100644 --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp @@ -9,7 +9,7 @@ #include "HIPUtility.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/Archive.h" diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp index 084f51721315c..9f8b676fc7dc2 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.cpp +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp @@ -11,7 +11,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/Hurd.cpp b/clang/lib/Driver/ToolChains/Hurd.cpp index 53ee4d4c0cbde..43121233ea7d0 100644 --- a/clang/lib/Driver/ToolChains/Hurd.cpp +++ b/clang/lib/Driver/ToolChains/Hurd.cpp @@ -10,7 +10,7 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Support/Path.h" #include "llvm/Support/VirtualFileSystem.h" @@ -168,7 +168,7 @@ void Hurd::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index 0f2ceff339eff..f452109134171 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -16,8 +16,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Distro.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Path.h" @@ -731,7 +731,7 @@ void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; if (DriverArgs.hasArg(clang::driver::options::OPT_fopenmp)) { diff --git a/clang/lib/Driver/ToolChains/MSP430.cpp b/clang/lib/Driver/ToolChains/MSP430.cpp index 3cc56bb7e832e..9eca1ad5f2865 100644 --- a/clang/lib/Driver/ToolChains/MSP430.cpp +++ b/clang/lib/Driver/ToolChains/MSP430.cpp @@ -12,7 +12,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/InputInfo.h" #include "clang/Driver/Multilib.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index e9972cb24c245..0d4bbabb9bb8a 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/ConvertUTF.h" diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp index 1bbabdfc631b8..da4a9072317f4 100644 --- a/clang/lib/Driver/ToolChains/Managarm.cpp +++ b/clang/lib/Driver/ToolChains/Managarm.cpp @@ -11,8 +11,8 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" @@ -136,7 +136,7 @@ void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs, const Driver &D = getDriver(); std::string SysRoot = computeSysRoot(); - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp index 0e8eeeb0523ce..bd0e40ae3d7ad 100644 --- a/clang/lib/Driver/ToolChains/MinGW.cpp +++ b/clang/lib/Driver/ToolChains/MinGW.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Config/llvm-config.h" // for LLVM_HOST_TRIPLE #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" diff --git a/clang/lib/Driver/ToolChains/MipsLinux.cpp b/clang/lib/Driver/ToolChains/MipsLinux.cpp index 58d6b5031f536..7dd3936613296 100644 --- a/clang/lib/Driver/ToolChains/MipsLinux.cpp +++ b/clang/lib/Driver/ToolChains/MipsLinux.cpp @@ -9,7 +9,7 @@ #include "MipsLinux.h" #include "Arch/Mips.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -38,7 +38,7 @@ MipsLLVMToolChain::MipsLLVMToolChain(const Driver &D, void MipsLLVMToolChain::AddClangSystemIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; const Driver &D = getDriver(); diff --git a/clang/lib/Driver/ToolChains/NetBSD.cpp b/clang/lib/Driver/ToolChains/NetBSD.cpp index ea722b59853d6..8db00deeb80df 100644 --- a/clang/lib/Driver/ToolChains/NetBSD.cpp +++ b/clang/lib/Driver/ToolChains/NetBSD.cpp @@ -14,8 +14,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" @@ -466,7 +466,7 @@ void NetBSD::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp index 607eb714f85dc..00991504e97a8 100644 --- a/clang/lib/Driver/ToolChains/OHOS.cpp +++ b/clang/lib/Driver/ToolChains/OHOS.cpp @@ -12,8 +12,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/FileSystem.h" @@ -174,7 +174,7 @@ OHOS::OHOS(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) ToolChain::RuntimeLibType OHOS::GetRuntimeLibType( const ArgList &Args) const { - if (Arg *A = Args.getLastArg(options::OPT_rtlib_EQ)) { + if (Arg *A = Args.getLastArg(clang::driver::options::OPT_rtlib_EQ)) { StringRef Value = A->getValue(); if (Value != "compiler-rt") getDriver().Diag(clang::diag::err_drv_invalid_rtlib_name) diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index 5e7b4f1a664e6..8f589186af343 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -13,8 +13,8 @@ #include "clang/Config/config.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" #include "llvm/Support/VirtualFileSystem.h" @@ -315,7 +315,7 @@ void OpenBSD::AddClangSystemIncludeArgs( llvm::opt::ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { diff --git a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp index 76180431ee682..8d381c4f14371 100644 --- a/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/PPCFreeBSD.cpp @@ -8,7 +8,7 @@ #include "PPCFreeBSD.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Support/Path.h" using namespace clang::driver::toolchains; @@ -16,7 +16,7 @@ using namespace llvm::opt; void PPCFreeBSDToolChain::AddClangSystemIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (!DriverArgs.hasArg(options::OPT_nostdinc) && + if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) && !DriverArgs.hasArg(options::OPT_nobuiltininc)) { const Driver &D = getDriver(); SmallString<128> P(D.ResourceDir); diff --git a/clang/lib/Driver/ToolChains/PPCLinux.cpp b/clang/lib/Driver/ToolChains/PPCLinux.cpp index 672ebd5b7b98d..768214e416bd7 100644 --- a/clang/lib/Driver/ToolChains/PPCLinux.cpp +++ b/clang/lib/Driver/ToolChains/PPCLinux.cpp @@ -8,7 +8,7 @@ #include "PPCLinux.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -58,7 +58,7 @@ PPCLinuxToolChain::PPCLinuxToolChain(const Driver &D, void PPCLinuxToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (!DriverArgs.hasArg(options::OPT_nostdinc) && + if (!DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) && !DriverArgs.hasArg(options::OPT_nobuiltininc)) { const Driver &D = getDriver(); SmallString<128> P(D.ResourceDir); diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 6fe18aa4cceba..34ec65ae59602 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -11,8 +11,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" diff --git a/clang/lib/Driver/ToolChains/SPIRV.cpp b/clang/lib/Driver/ToolChains/SPIRV.cpp index 22aad7e3f49d7..afb9e63b4b348 100644 --- a/clang/lib/Driver/ToolChains/SPIRV.cpp +++ b/clang/lib/Driver/ToolChains/SPIRV.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/InputInfo.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" using namespace clang::driver; using namespace clang::driver::toolchains; diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 85859f344b491..0232b047a6c4b 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -20,7 +20,7 @@ SYCLInstallationDetector::SYCLInstallationDetector( void SYCLInstallationDetector::addSYCLIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(options::OPT_nobuiltininc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nobuiltininc)) return; // Add the SYCL header search locations in the specified order. diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index ad0f41144f393..64c7d1ceb3a36 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -13,9 +13,9 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "clang/Driver/ToolChain.h" -#include "clang/Options/Options.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" @@ -360,7 +360,7 @@ void Solaris::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { const Driver &D = getDriver(); - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) diff --git a/clang/lib/Driver/ToolChains/UEFI.cpp b/clang/lib/Driver/ToolChains/UEFI.cpp index 7732e37f8061d..d2be147c7b9f6 100644 --- a/clang/lib/Driver/ToolChains/UEFI.cpp +++ b/clang/lib/Driver/ToolChains/UEFI.cpp @@ -11,8 +11,8 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" -#include "clang/Options/Options.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" diff --git a/clang/lib/Driver/ToolChains/VEToolchain.cpp b/clang/lib/Driver/ToolChains/VEToolchain.cpp index 78509bcdae0fe..ad9129046c3e1 100644 --- a/clang/lib/Driver/ToolChains/VEToolchain.cpp +++ b/clang/lib/Driver/ToolChains/VEToolchain.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/Path.h" #include // ::getenv @@ -78,7 +78,7 @@ bool VEToolChain::hasBlocksRuntime() const { return false; } void VEToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; if (DriverArgs.hasArg(options::OPT_nobuiltininc) && @@ -117,7 +117,7 @@ void VEToolChain::addClangTargetOptions(const ArgList &DriverArgs, void VEToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(options::OPT_nostdinc) || + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp index 15c6f19e87fee..5054868b5ff4d 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -12,7 +12,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Config/llvm-config.h" // for LLVM_VERSION_STRING #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" @@ -297,7 +297,7 @@ bool WebAssembly::HasNativeLLVMSupport() const { return true; } void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, ArgStringList &CC1Args, Action::OffloadKind) const { - if (!DriverArgs.hasFlag(options::OPT_fuse_init_array, + if (!DriverArgs.hasFlag(clang::driver::options::OPT_fuse_init_array, options::OPT_fno_use_init_array, true)) CC1Args.push_back("-fno-use-init-array"); @@ -472,7 +472,7 @@ WebAssembly::GetCXXStdlibType(const ArgList &Args) const { void WebAssembly::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(options::OPT_nostdinc)) + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; const Driver &D = getDriver(); diff --git a/clang/lib/Driver/ToolChains/XCore.cpp b/clang/lib/Driver/ToolChains/XCore.cpp index dd26c11affffb..6a2a75cb99739 100644 --- a/clang/lib/Driver/ToolChains/XCore.cpp +++ b/clang/lib/Driver/ToolChains/XCore.cpp @@ -10,7 +10,7 @@ #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include // ::getenv @@ -113,7 +113,7 @@ bool XCoreToolChain::hasBlocksRuntime() const { return false; } void XCoreToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(options::OPT_nostdinc) || + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc)) return; if (const char *cl_include_dir = getenv("XCC_C_INCLUDE_PATH")) { @@ -137,7 +137,7 @@ void XCoreToolChain::addClangTargetOptions(const ArgList &DriverArgs, void XCoreToolChain::AddClangCXXStdlibIncludeArgs( const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (DriverArgs.hasArg(options::OPT_nostdinc) || + if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc) || DriverArgs.hasArg(options::OPT_nostdlibinc) || DriverArgs.hasArg(options::OPT_nostdincxx)) return; diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp index eac8f623f9a50..9a3c45323a3cf 100644 --- a/clang/lib/Driver/ToolChains/ZOS.cpp +++ b/clang/lib/Driver/ToolChains/ZOS.cpp @@ -9,7 +9,7 @@ #include "ZOS.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/WithColor.h" diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index 4c2d11751a363..0325296f84b19 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -8,8 +8,8 @@ #include "clang/Driver/XRayArgs.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" -#include "clang/Options/Options.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/SpecialCaseList.h" diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt index dac9e0d26f393..a916667208845 100644 --- a/clang/lib/Frontend/CMakeLists.txt +++ b/clang/lib/Frontend/CMakeLists.txt @@ -52,7 +52,6 @@ add_clang_library(clangFrontend clangAST clangBasic clangDriver - clangOptions clangEdit clangLex clangParse diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 0782dc1b585c3..66759d363d531 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -27,6 +27,7 @@ #include "clang/Basic/XRayInstr.h" #include "clang/Config/config.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Frontend/CommandLineSourceLoc.h" #include "clang/Frontend/DependencyOutputOptions.h" #include "clang/Frontend/FrontendDiagnostic.h" @@ -37,7 +38,6 @@ #include "clang/Frontend/Utils.h" #include "clang/Lex/HeaderSearchOptions.h" #include "clang/Lex/PreprocessorOptions.h" -#include "clang/Options/Options.h" #include "clang/Serialization/ASTBitCodes.h" #include "clang/Serialization/ModuleFileExtension.h" #include "clang/StaticAnalyzer/Core/AnalyzerOptions.h" @@ -255,7 +255,7 @@ CowCompilerInvocation::getMutPreprocessorOutputOpts() { using ArgumentConsumer = CompilerInvocation::ArgumentConsumer; #define OPTTABLE_STR_TABLE_CODE -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef OPTTABLE_STR_TABLE_CODE static llvm::StringRef lookupStrInTable(unsigned Offset) { @@ -263,7 +263,7 @@ static llvm::StringRef lookupStrInTable(unsigned Offset) { } #define SIMPLE_ENUM_VALUE_TABLE -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef SIMPLE_ENUM_VALUE_TABLE static std::optional normalizeSimpleFlag(OptSpecifier Opt, @@ -989,7 +989,7 @@ static void GenerateAnalyzerArgs(const AnalyzerOptions &Opts, #define ANALYZER_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef ANALYZER_OPTION_WITH_MARSHALLING if (Opts.AnalysisConstraintsOpt != RangeConstraintsModel) { @@ -1076,7 +1076,7 @@ static bool ParseAnalyzerArgs(AnalyzerOptions &Opts, ArgList &Args, #define ANALYZER_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef ANALYZER_OPTION_WITH_MARSHALLING if (Arg *A = Args.getLastArg(OPT_analyzer_constraints)) { @@ -1583,7 +1583,7 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, #define CODEGEN_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef CODEGEN_OPTION_WITH_MARSHALLING if (Opts.OptimizationLevel > 0) { @@ -1888,7 +1888,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, #define CODEGEN_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef CODEGEN_OPTION_WITH_MARSHALLING // At O0 we want to fully disable inlining outside of cases marked with @@ -2379,7 +2379,7 @@ static void GenerateDependencyOutputArgs(const DependencyOutputOptions &Opts, const DependencyOutputOptions &DependencyOutputOpts = Opts; #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING if (Opts.ShowIncludesDest != ShowIncludesDestination::None) @@ -2414,7 +2414,7 @@ static bool ParseDependencyOutputArgs(DependencyOutputOptions &Opts, DependencyOutputOptions &DependencyOutputOpts = Opts; #define DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef DEPENDENCY_OUTPUT_OPTION_WITH_MARSHALLING if (Args.hasArg(OPT_show_includes)) { @@ -2542,7 +2542,7 @@ static void GenerateFileSystemArgs(const FileSystemOptions &Opts, #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING } @@ -2554,7 +2554,7 @@ static bool ParseFileSystemArgs(FileSystemOptions &Opts, const ArgList &Args, #define FILE_SYSTEM_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef FILE_SYSTEM_OPTION_WITH_MARSHALLING return Diags.getNumErrors() == NumErrorsBefore; @@ -2565,7 +2565,7 @@ static void GenerateMigratorArgs(const MigratorOptions &Opts, const MigratorOptions &MigratorOpts = Opts; #define MIGRATOR_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef MIGRATOR_OPTION_WITH_MARSHALLING } @@ -2577,7 +2577,7 @@ static bool ParseMigratorArgs(MigratorOptions &Opts, const ArgList &Args, #define MIGRATOR_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef MIGRATOR_OPTION_WITH_MARSHALLING return Diags.getNumErrors() == NumErrorsBefore; @@ -2589,7 +2589,7 @@ void CompilerInvocationBase::GenerateDiagnosticArgs( const DiagnosticOptions *DiagnosticOpts = &Opts; #define DIAG_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef DIAG_OPTION_WITH_MARSHALLING if (!Opts.DiagnosticSerializationFile.empty()) @@ -2694,7 +2694,7 @@ bool clang::ParseDiagnosticArgs(DiagnosticOptions &Opts, ArgList &Args, #define DIAG_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, *Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef DIAG_OPTION_WITH_MARSHALLING llvm::sys::Process::UseANSIEscapeCodes(Opts.UseANSIEscapeCodes); @@ -2844,7 +2844,7 @@ static void GenerateFrontendArgs(const FrontendOptions &Opts, const FrontendOptions &FrontendOpts = Opts; #define FRONTEND_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef FRONTEND_OPTION_WITH_MARSHALLING std::optional ProgramActionOpt = @@ -3014,7 +3014,7 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args, #define FRONTEND_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef FRONTEND_OPTION_WITH_MARSHALLING Opts.ProgramAction = frontend::ParseSyntaxOnly; @@ -3296,7 +3296,7 @@ static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts, const HeaderSearchOptions *HeaderSearchOpts = &Opts; #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING if (Opts.UseLibcxx) @@ -3411,7 +3411,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args, #define HEADER_SEARCH_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef HEADER_SEARCH_OPTION_WITH_MARSHALLING if (const Arg *A = Args.getLastArg(OPT_stdlib_EQ)) @@ -3744,7 +3744,7 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, #define LANG_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING // The '-fcf-protection=' option is generated by CodeGenOpts generator. @@ -4153,7 +4153,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, #define LANG_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) { @@ -4874,7 +4874,7 @@ static void GeneratePreprocessorArgs(const PreprocessorOptions &Opts, #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef PREPROCESSOR_OPTION_WITH_MARSHALLING if (Opts.PCHWithHdrStop && !Opts.PCHWithHdrStopCreate) @@ -4948,7 +4948,7 @@ static bool ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args, #define PREPROCESSOR_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef PREPROCESSOR_OPTION_WITH_MARSHALLING Opts.PCHWithHdrStop = Args.hasArg(OPT_pch_through_hdrstop_create) || @@ -5041,7 +5041,7 @@ GeneratePreprocessorOutputArgs(const PreprocessorOutputOptions &Opts, #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING bool Generate_dM = isStrictlyPreprocessorAction(Action) && !Opts.ShowCPP; @@ -5062,7 +5062,7 @@ static bool ParsePreprocessorOutputArgs(PreprocessorOutputOptions &Opts, #define PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef PREPROCESSOR_OUTPUT_OPTION_WITH_MARSHALLING Opts.ShowCPP = isStrictlyPreprocessorAction(Action) && !Args.hasArg(OPT_dM); @@ -5077,7 +5077,7 @@ static void GenerateTargetArgs(const TargetOptions &Opts, const TargetOptions *TargetOpts = &Opts; #define TARGET_OPTION_WITH_MARSHALLING(...) \ GENERATE_OPTION_WITH_MARSHALLING(Consumer, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef TARGET_OPTION_WITH_MARSHALLING if (!Opts.SDKVersion.empty()) @@ -5096,7 +5096,7 @@ static bool ParseTargetArgs(TargetOptions &Opts, ArgList &Args, #define TARGET_OPTION_WITH_MARSHALLING(...) \ PARSE_OPTION_WITH_MARSHALLING(Args, Diags, __VA_ARGS__) -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef TARGET_OPTION_WITH_MARSHALLING if (Arg *A = Args.getLastArg(options::OPT_target_sdk_version_EQ)) { diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp index 73b81ed906808..99212b81fe064 100644 --- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp +++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -14,11 +14,11 @@ #include "clang/Driver/Action.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/Utils.h" -#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Option/ArgList.h" @@ -61,11 +61,11 @@ clang::createInvocation(ArrayRef ArgList, if (!C) return nullptr; - if (C->getArgs().hasArg(options::OPT_fdriver_only)) + if (C->getArgs().hasArg(driver::options::OPT_fdriver_only)) return nullptr; // Just print the cc1 options if -### was present. - if (C->getArgs().hasArg(options::OPT__HASH_HASH_HASH)) { + if (C->getArgs().hasArg(driver::options::OPT__HASH_HASH_HASH)) { C->getJobs().Print(llvm::errs(), "\n", true); return nullptr; } diff --git a/clang/lib/FrontendTool/CMakeLists.txt b/clang/lib/FrontendTool/CMakeLists.txt index 66213f76eb968..061e54c3e62d0 100644 --- a/clang/lib/FrontendTool/CMakeLists.txt +++ b/clang/lib/FrontendTool/CMakeLists.txt @@ -7,7 +7,6 @@ set(link_libs clangBasic clangCodeGen clangDriver - clangOptions clangExtractAPI clangFrontend clangRewriteFrontend diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index e571193c6a9c8..c8aad4daa1c10 100644 --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -13,6 +13,7 @@ #include "clang/CodeGen/CodeGenAction.h" #include "clang/Config/config.h" +#include "clang/Driver/Options.h" #include "clang/ExtractAPI/FrontendActions.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/CompilerInvocation.h" @@ -21,7 +22,6 @@ #include "clang/Frontend/FrontendPluginRegistry.h" #include "clang/Frontend/Utils.h" #include "clang/FrontendTool/Utils.h" -#include "clang/Options/Options.h" #include "clang/Rewrite/Frontend/FrontendActions.h" #include "clang/StaticAnalyzer/Frontend/AnalyzerHelpFlags.h" #include "clang/StaticAnalyzer/Frontend/FrontendActions.h" @@ -215,11 +215,11 @@ bool ExecuteCompilerInvocation(CompilerInstance *Clang) { // Honor -help. if (Clang->getFrontendOpts().ShowHelp) { - getDriverOptTable().printHelp( + driver::getDriverOptTable().printHelp( llvm::outs(), "clang -cc1 [options] file...", "LLVM 'Clang' Compiler: http://clang.llvm.org", /*ShowHidden=*/false, /*ShowAllAliases=*/false, - llvm::opt::Visibility(options::CC1Option)); + llvm::opt::Visibility(driver::options::CC1Option)); return true; } diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index 7764fa7dc92b9..76338065d2231 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -33,6 +33,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" +#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/FrontendAction.h" @@ -42,7 +43,6 @@ #include "clang/Interpreter/Interpreter.h" #include "clang/Interpreter/Value.h" #include "clang/Lex/PreprocessorOptions.h" -#include "clang/Options/Options.h" #include "clang/Sema/Lookup.h" #include "clang/Serialization/ObjectFilePCHContainerReader.h" #include "llvm/ExecutionEngine/JITSymbol.h" @@ -185,7 +185,7 @@ IncrementalCompilerBuilder::create(std::string TT, llvm::ArrayRef RF = llvm::ArrayRef(ClangArgv); std::unique_ptr Compilation(Driver.BuildCompilation(RF)); - if (Compilation->getArgs().hasArg(options::OPT_v)) + if (Compilation->getArgs().hasArg(driver::options::OPT_v)) Compilation->getJobs().Print(llvm::errs(), "\n", /*Quote=*/false); auto ErrOrCC1Args = GetCC1Arguments(&Diags, Compilation.get()); diff --git a/clang/lib/Interpreter/InterpreterUtils.h b/clang/lib/Interpreter/InterpreterUtils.h index 4efe8b9fbc6cc..fbf9814b0d4a7 100644 --- a/clang/lib/Interpreter/InterpreterUtils.h +++ b/clang/lib/Interpreter/InterpreterUtils.h @@ -21,11 +21,11 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" +#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/TextDiagnosticBuffer.h" #include "clang/Lex/PreprocessorOptions.h" -#include "clang/Options/Options.h" #include "clang/Sema/Lookup.h" #include "llvm/IR/Module.h" diff --git a/clang/lib/Options/CMakeLists.txt b/clang/lib/Options/CMakeLists.txt deleted file mode 100644 index a762e9918b41c..0000000000000 --- a/clang/lib/Options/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -set(LLVM_LINK_COMPONENTS - Option - Support -) - -add_clang_library(clangOptions - DriverOptions.cpp - OptionUtils.cpp - - DEPENDS - ClangDriverOptions - # These generated headers are included transitively. - target_parser_gen - - LINK_LIBS - clangBasic - ${system_libs} -) diff --git a/clang/lib/Tooling/CMakeLists.txt b/clang/lib/Tooling/CMakeLists.txt index faaa53276d0e6..fc1f1f9f9d367 100644 --- a/clang/lib/Tooling/CMakeLists.txt +++ b/clang/lib/Tooling/CMakeLists.txt @@ -40,7 +40,6 @@ add_clang_library(clangTooling clangASTMatchers clangBasic clangDriver - clangOptions clangFormat clangFrontend clangLex diff --git a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp index e9b72388ae4df..28568426a6c48 100644 --- a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp +++ b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp @@ -44,8 +44,8 @@ #include "clang/Basic/LangStandard.h" #include "clang/Driver/Driver.h" +#include "clang/Driver/Options.h" #include "clang/Driver/Types.h" -#include "clang/Options/Options.h" #include "clang/Tooling/CompilationDatabase.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -164,11 +164,11 @@ struct TransferableCommand { // We parse each argument individually so that we can retain the exact // spelling of each argument; re-rendering is lossy for aliased flags. // E.g. in CL mode, /W4 maps to -Wall. - auto &OptTable = getDriverOptTable(); + auto &OptTable = clang::driver::getDriverOptTable(); if (!OldArgs.empty()) Cmd.CommandLine.emplace_back(OldArgs.front()); for (unsigned Pos = 1; Pos < OldArgs.size();) { - using namespace options; + using namespace driver::options; const unsigned OldPos = Pos; std::unique_ptr Arg(OptTable.ParseOneArg( @@ -296,14 +296,14 @@ struct TransferableCommand { // Try to interpret the argument as a type specifier, e.g. '-x'. std::optional tryParseTypeArg(const llvm::opt::Arg &Arg) { const llvm::opt::Option &Opt = Arg.getOption(); - using namespace options; + using namespace driver::options; if (ClangCLMode) { if (Opt.matches(OPT__SLASH_TC) || Opt.matches(OPT__SLASH_Tc)) return types::TY_C; if (Opt.matches(OPT__SLASH_TP) || Opt.matches(OPT__SLASH_Tp)) return types::TY_CXX; } else { - if (Opt.matches(options::OPT_x)) + if (Opt.matches(driver::options::OPT_x)) return types::lookupTypeForTypeSpecifier(Arg.getValue()); } return std::nullopt; @@ -311,7 +311,7 @@ struct TransferableCommand { // Try to interpret the argument as '-std='. std::optional tryParseStdArg(const llvm::opt::Arg &Arg) { - using namespace options; + using namespace driver::options; if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ)) { // "c++latest" is not a recognized LangStandard, but it's accepted by // the clang driver in CL mode. diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index 1f6a5c94601fc..e8eef5ed9c9fa 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -21,6 +21,7 @@ #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/Job.h" +#include "clang/Driver/Options.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" #include "clang/Frontend/ASTUnit.h" @@ -31,7 +32,6 @@ #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Lex/HeaderSearchOptions.h" #include "clang/Lex/PreprocessorOptions.h" -#include "clang/Options/Options.h" #include "clang/Tooling/ArgumentsAdjusters.h" #include "clang/Tooling/CompilationDatabase.h" #include "llvm/ADT/ArrayRef.h" @@ -270,15 +270,17 @@ void addTargetAndModeForProgramName(std::vector &CommandLine, StringRef InvokedAs) { if (CommandLine.empty() || InvokedAs.empty()) return; - const auto &Table = getDriverOptTable(); + const auto &Table = driver::getDriverOptTable(); // --target=X - StringRef TargetOPT = Table.getOption(options::OPT_target).getPrefixedName(); + StringRef TargetOPT = + Table.getOption(driver::options::OPT_target).getPrefixedName(); // -target X StringRef TargetOPTLegacy = - Table.getOption(options::OPT_target_legacy_spelling).getPrefixedName(); + Table.getOption(driver::options::OPT_target_legacy_spelling) + .getPrefixedName(); // --driver-mode=X StringRef DriverModeOPT = - Table.getOption(options::OPT_driver_mode).getPrefixedName(); + Table.getOption(driver::options::OPT_driver_mode).getPrefixedName(); auto TargetMode = driver::ToolChain::getTargetAndModeFromProgramName(InvokedAs); // No need to search for target args if we don't have a target/mode to insert. diff --git a/clang/tools/clang-check/CMakeLists.txt b/clang/tools/clang-check/CMakeLists.txt index 1efcc91fcaec9..5493aa4237aee 100644 --- a/clang/tools/clang-check/CMakeLists.txt +++ b/clang/tools/clang-check/CMakeLists.txt @@ -14,7 +14,6 @@ clang_target_link_libraries(clang-check clangBasic clangDriver clangFrontend - clangOptions clangRewriteFrontend clangSerialization clangStaticAnalyzerFrontend diff --git a/clang/tools/clang-check/ClangCheck.cpp b/clang/tools/clang-check/ClangCheck.cpp index 80255c647b98f..fa6dd06a1ee58 100644 --- a/clang/tools/clang-check/ClangCheck.cpp +++ b/clang/tools/clang-check/ClangCheck.cpp @@ -16,9 +16,9 @@ //===----------------------------------------------------------------------===// #include "clang/AST/ASTConsumer.h" +#include "clang/Driver/Options.h" #include "clang/Frontend/ASTConsumers.h" #include "clang/Frontend/CompilerInstance.h" -#include "clang/Options/Options.h" #include "clang/Rewrite/Frontend/FixItRewriter.h" #include "clang/Rewrite/Frontend/FrontendActions.h" #include "clang/StaticAnalyzer/Frontend/FrontendActions.h" @@ -34,8 +34,8 @@ #include "llvm/Support/Signals.h" #include "llvm/Support/TargetSelect.h" +using namespace clang::driver; using namespace clang::tooling; -using namespace clang; using namespace llvm; static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage); diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt index 54bc80486472f..9c0d9dff7dc7f 100644 --- a/clang/tools/clang-installapi/CMakeLists.txt +++ b/clang/tools/clang-installapi/CMakeLists.txt @@ -25,7 +25,6 @@ clang_target_link_libraries(clang-installapi clangAST clangInstallAPI clangBasic - clangOptions clangDriver clangFrontend clangTooling diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp index 8bef9690ad855..4e66485343b89 100644 --- a/clang/tools/clang-installapi/ClangInstallAPI.cpp +++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp @@ -35,7 +35,7 @@ using namespace clang; using namespace clang::installapi; -using namespace clang::options; +using namespace clang::driver::options; using namespace llvm::opt; using namespace llvm::MachO; @@ -71,7 +71,7 @@ static bool runFrontend(StringRef ProgName, Twine Label, bool Verbose, static bool run(ArrayRef Args, const char *ProgName) { // Setup Diagnostics engine. DiagnosticOptions DiagOpts; - const llvm::opt::OptTable &ClangOpts = getDriverOptTable(); + const llvm::opt::OptTable &ClangOpts = clang::driver::getDriverOptTable(); unsigned MissingArgIndex, MissingArgCount; llvm::opt::InputArgList ParsedArgs = ClangOpts.ParseArgs( ArrayRef(Args).slice(1), MissingArgIndex, MissingArgCount); diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp index f484d6f33ad8f..64324a3f8b010 100644 --- a/clang/tools/clang-installapi/Options.cpp +++ b/clang/tools/clang-installapi/Options.cpp @@ -26,6 +26,8 @@ using namespace llvm; using namespace llvm::opt; using namespace llvm::MachO; +namespace drv = clang::driver::options; + namespace clang { namespace installapi { @@ -107,7 +109,7 @@ getArgListFromJSON(const StringRef Input, llvm::opt::OptTable *Table, bool Options::processDriverOptions(InputArgList &Args) { // Handle inputs. - for (const StringRef Path : Args.getAllArgValues(options::OPT_INPUT)) { + for (const StringRef Path : Args.getAllArgValues(drv::OPT_INPUT)) { // Assume any input that is not a directory is a filelist. // InstallAPI does not accept multiple directories, so retain the last one. if (FM->getOptionalDirectoryRef(Path)) @@ -118,7 +120,7 @@ bool Options::processDriverOptions(InputArgList &Args) { // Handle output. SmallString OutputPath; - if (auto *Arg = Args.getLastArg(options::OPT_o)) { + if (auto *Arg = Args.getLastArg(drv::OPT_o)) { OutputPath = Arg->getValue(); if (OutputPath != "-") FM->makeAbsolutePath(OutputPath); @@ -130,10 +132,10 @@ bool Options::processDriverOptions(InputArgList &Args) { } // Do basic error checking first for mixing -target and -arch options. - auto *ArgArch = Args.getLastArgNoClaim(options::OPT_arch); - auto *ArgTarget = Args.getLastArgNoClaim(options::OPT_target); + auto *ArgArch = Args.getLastArgNoClaim(drv::OPT_arch); + auto *ArgTarget = Args.getLastArgNoClaim(drv::OPT_target); auto *ArgTargetVariant = - Args.getLastArgNoClaim(options::OPT_darwin_target_variant); + Args.getLastArgNoClaim(drv::OPT_darwin_target_variant); if (ArgArch && (ArgTarget || ArgTargetVariant)) { Diags->Report(clang::diag::err_drv_argument_not_allowed_with) << ArgArch->getAsString(Args) @@ -141,7 +143,7 @@ bool Options::processDriverOptions(InputArgList &Args) { return false; } - auto *ArgMinTargetOS = Args.getLastArgNoClaim(options::OPT_mtargetos_EQ); + auto *ArgMinTargetOS = Args.getLastArgNoClaim(drv::OPT_mtargetos_EQ); if ((ArgTarget || ArgTargetVariant) && ArgMinTargetOS) { Diags->Report(clang::diag::err_drv_cannot_mix_options) << ArgTarget->getAsString(Args) << ArgMinTargetOS->getAsString(Args); @@ -150,7 +152,7 @@ bool Options::processDriverOptions(InputArgList &Args) { // Capture target triples first. if (ArgTarget) { - for (const Arg *A : Args.filtered(options::OPT_target)) { + for (const Arg *A : Args.filtered(drv::OPT_target)) { A->claim(); llvm::Triple TargetTriple(A->getValue()); Target TAPITarget = Target(TargetTriple); @@ -166,7 +168,7 @@ bool Options::processDriverOptions(InputArgList &Args) { // Capture target variants. DriverOpts.Zippered = ArgTargetVariant != nullptr; - for (Arg *A : Args.filtered(options::OPT_darwin_target_variant)) { + for (Arg *A : Args.filtered(drv::OPT_darwin_target_variant)) { A->claim(); Triple Variant(A->getValue()); if (Variant.getVendor() != Triple::Apple) { @@ -211,7 +213,7 @@ bool Options::processDriverOptions(InputArgList &Args) { DriverOpts.Targets[TAPIVariant] = Variant; } - DriverOpts.Verbose = Args.hasArgNoClaim(options::OPT_v); + DriverOpts.Verbose = Args.hasArgNoClaim(drv::OPT_v); return true; } @@ -405,7 +407,7 @@ bool Options::processOptionList(InputArgList &Args, bool Options::processLinkerOptions(InputArgList &Args) { // Handle required arguments. - if (const Arg *A = Args.getLastArg(options::OPT_install__name)) + if (const Arg *A = Args.getLastArg(drv::OPT_install__name)) LinkerOpts.InstallName = A->getValue(); if (LinkerOpts.InstallName.empty()) { Diags->Report(diag::err_no_install_name); @@ -413,29 +415,28 @@ bool Options::processLinkerOptions(InputArgList &Args) { } // Defaulted or optional arguments. - if (auto *Arg = Args.getLastArg(options::OPT_current__version)) + if (auto *Arg = Args.getLastArg(drv::OPT_current__version)) LinkerOpts.CurrentVersion.parse64(Arg->getValue()); - if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version)) + if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version)) LinkerOpts.CompatVersion.parse64(Arg->getValue()); - if (auto *Arg = Args.getLastArg(options::OPT_compatibility__version)) + if (auto *Arg = Args.getLastArg(drv::OPT_compatibility__version)) LinkerOpts.CompatVersion.parse64(Arg->getValue()); - if (auto *Arg = Args.getLastArg(options::OPT_umbrella)) + if (auto *Arg = Args.getLastArg(drv::OPT_umbrella)) LinkerOpts.ParentUmbrella = Arg->getValue(); - LinkerOpts.IsDylib = Args.hasArg(options::OPT_dynamiclib); + LinkerOpts.IsDylib = Args.hasArg(drv::OPT_dynamiclib); - for (auto *Arg : Args.filtered(options::OPT_alias_list)) { + for (auto *Arg : Args.filtered(drv::OPT_alias_list)) { LinkerOpts.AliasLists.emplace_back(Arg->getValue()); Arg->claim(); } - LinkerOpts.AppExtensionSafe = - Args.hasFlag(options::OPT_fapplication_extension, - options::OPT_fno_application_extension, - /*Default=*/LinkerOpts.AppExtensionSafe); + LinkerOpts.AppExtensionSafe = Args.hasFlag( + drv::OPT_fapplication_extension, drv::OPT_fno_application_extension, + /*Default=*/LinkerOpts.AppExtensionSafe); if (::getenv("LD_NO_ENCRYPT") != nullptr) LinkerOpts.AppExtensionSafe = true; @@ -445,7 +446,7 @@ bool Options::processLinkerOptions(InputArgList &Args) { // Capture library paths. PathSeq LibraryPaths; - for (const Arg *A : Args.filtered(options::OPT_L)) { + for (const Arg *A : Args.filtered(drv::OPT_L)) { LibraryPaths.emplace_back(A->getValue()); A->claim(); } @@ -460,7 +461,7 @@ bool Options::processLinkerOptions(InputArgList &Args) { // invocations. bool Options::processFrontendOptions(InputArgList &Args) { // Capture language mode. - if (auto *A = Args.getLastArgNoClaim(options::OPT_x)) { + if (auto *A = Args.getLastArgNoClaim(drv::OPT_x)) { FEOpts.LangMode = llvm::StringSwitch(A->getValue()) .Case("c", clang::Language::C) .Case("c++", clang::Language::CXX) @@ -474,15 +475,15 @@ bool Options::processFrontendOptions(InputArgList &Args) { return false; } } - for (auto *A : Args.filtered(options::OPT_ObjC, options::OPT_ObjCXX)) { - if (A->getOption().matches(options::OPT_ObjC)) + for (auto *A : Args.filtered(drv::OPT_ObjC, drv::OPT_ObjCXX)) { + if (A->getOption().matches(drv::OPT_ObjC)) FEOpts.LangMode = clang::Language::ObjC; else FEOpts.LangMode = clang::Language::ObjCXX; } // Capture Sysroot. - if (const Arg *A = Args.getLastArgNoClaim(options::OPT_isysroot)) { + if (const Arg *A = Args.getLastArgNoClaim(drv::OPT_isysroot)) { SmallString Path(A->getValue()); FM->makeAbsolutePath(Path); if (!FM->getOptionalDirectoryRef(Path)) { @@ -501,13 +502,13 @@ bool Options::processFrontendOptions(InputArgList &Args) { } // Capture system frameworks for all platforms. - for (const Arg *A : Args.filtered(options::OPT_iframework)) + for (const Arg *A : Args.filtered(drv::OPT_iframework)) FEOpts.SystemFwkPaths.emplace_back(A->getValue(), std::optional{}); // Capture framework paths. PathSeq FrameworkPaths; - for (const Arg *A : Args.filtered(options::OPT_F)) + for (const Arg *A : Args.filtered(drv::OPT_F)) FrameworkPaths.emplace_back(A->getValue()); if (!FrameworkPaths.empty()) diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt index db9e0de9b59e0..a9fa61f9f75ee 100644 --- a/clang/tools/driver/CMakeLists.txt +++ b/clang/tools/driver/CMakeLists.txt @@ -63,7 +63,6 @@ clang_target_link_libraries(clang clangDriver clangFrontend clangFrontendTool - clangOptions clangSerialization ) diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp index 2aef75597fc5f..52cffa4ccbe1f 100644 --- a/clang/tools/driver/cc1_main.cpp +++ b/clang/tools/driver/cc1_main.cpp @@ -17,6 +17,7 @@ #include "clang/CodeGen/ObjectFilePCHContainerWriter.h" #include "clang/Config/config.h" #include "clang/Driver/DriverDiagnostic.h" +#include "clang/Driver/Options.h" #include "clang/Frontend/CompilerInstance.h" #include "clang/Frontend/CompilerInvocation.h" #include "clang/Frontend/FrontendDiagnostic.h" @@ -24,7 +25,6 @@ #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Frontend/Utils.h" #include "clang/FrontendTool/Utils.h" -#include "clang/Options/Options.h" #include "clang/Serialization/ObjectFilePCHContainerReader.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp index f13812f2a8383..50da2f8449a22 100644 --- a/clang/tools/driver/cc1as_main.cpp +++ b/clang/tools/driver/cc1as_main.cpp @@ -14,10 +14,10 @@ #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticOptions.h" #include "clang/Driver/DriverDiagnostic.h" +#include "clang/Driver/Options.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Frontend/Utils.h" -#include "clang/Options/Options.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" @@ -59,7 +59,8 @@ #include #include using namespace clang; -using namespace clang::options; +using namespace clang::driver; +using namespace clang::driver::options; using namespace llvm; using namespace llvm::opt; @@ -687,7 +688,8 @@ int cc1as_main(ArrayRef Argv, const char *Argv0, void *MainAddr) { getDriverOptTable().printHelp( llvm::outs(), "clang -cc1as [options] file...", "Clang Integrated Assembler", /*ShowHidden=*/false, - /*ShowAllAliases=*/false, llvm::opt::Visibility(options::CC1AsOption)); + /*ShowAllAliases=*/false, + llvm::opt::Visibility(driver::options::CC1AsOption)); return 0; } diff --git a/clang/tools/driver/driver.cpp b/clang/tools/driver/driver.cpp index dffa6ba0e4c91..6cf64d70c9399 100644 --- a/clang/tools/driver/driver.cpp +++ b/clang/tools/driver/driver.cpp @@ -18,13 +18,13 @@ #include "clang/Config/config.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/DriverDiagnostic.h" +#include "clang/Driver/Options.h" #include "clang/Driver/ToolChain.h" #include "clang/Frontend/ChainedDiagnosticConsumer.h" #include "clang/Frontend/CompilerInvocation.h" #include "clang/Frontend/SerializedDiagnosticPrinter.h" #include "clang/Frontend/TextDiagnosticPrinter.h" #include "clang/Frontend/Utils.h" -#include "clang/Options/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp index e0454f190b35a..62274235c53f5 100644 --- a/clang/unittests/Driver/DXCModeTest.cpp +++ b/clang/unittests/Driver/DXCModeTest.cpp @@ -131,8 +131,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) { TC.TranslateArgs(*DAL, "0", Action::OffloadKind::OFK_None)}; EXPECT_NE(TranslatedArgs, nullptr); if (TranslatedArgs) { - auto *A = - TranslatedArgs->getLastArg(clang::options::OPT_dxil_validator_version); + auto *A = TranslatedArgs->getLastArg( + clang::driver::options::OPT_dxil_validator_version); EXPECT_NE(A, nullptr); if (A) { EXPECT_STREQ(A->getValue(), "1.1"); diff --git a/clang/www/OpenProjects.html b/clang/www/OpenProjects.html index 3e5e84b5b2ed4..ae0ec1d4d12cb 100755 --- a/clang/www/OpenProjects.html +++ b/clang/www/OpenProjects.html @@ -38,7 +38,7 @@

    Open Clang Projects

  • documenting diagnostic group flags (adding code examples of what is diagnosed, or other relevant information), or
  • -
  • documenting +
  • documenting command line options, or
  • help with completing other missing documentation.
  • diff --git a/flang/docs/CMakeLists.txt b/flang/docs/CMakeLists.txt index b183d6add1059..568f942cb4aa6 100644 --- a/flang/docs/CMakeLists.txt +++ b/flang/docs/CMakeLists.txt @@ -88,7 +88,7 @@ function (gen_rst_file_from_td output_file td_option source target) endif() get_filename_component(TABLEGEN_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${source}" DIRECTORY) list(APPEND LLVM_TABLEGEN_FLAGS "-I${TABLEGEN_INCLUDE_DIR}") - list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Options/") + list(APPEND LLVM_TABLEGEN_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}/../../clang/include/clang/Driver/") clang_tablegen(Source/${output_file} ${td_option} SOURCE ${source} TARGET ${target}) endfunction() diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md index 9953f2252218b..3286171bb1499 100644 --- a/flang/docs/FlangDriver.md +++ b/flang/docs/FlangDriver.md @@ -76,7 +76,7 @@ will ignore it when used without `Xflang`. As hinted above, `flang` and `flang -fc1` are two separate tools. The fact that these tools are accessed through one binary, `flang`, is just an implementation detail. Each tool has a separate list of options, albeit defined -in the same file: `clang/include/clang/Options/Options.td`. +in the same file: `clang/include/clang/Driver/Options.td`. The separation helps us split various tasks and allows us to implement more specialised tools. In particular, `flang` is not aware of various @@ -112,7 +112,7 @@ in terms of Clang's driver library, `clangDriver`. This approach allows us to: as linkers and assemblers. One implication of this dependency on Clang is that all of Flang's compiler options are defined alongside Clang's options in -`clang/include/clang/Options/Options.td`. For options that are common for both +`clang/include/clang/Driver/Options.td`. For options that are common for both Flang and Clang, the corresponding definitions are shared. Internally, a `clangDriver` based compiler driver works by creating actions @@ -242,7 +242,7 @@ Adding a new compiler option in Flang consists of two steps: ### Option Definition All of Flang's compiler and frontend driver options are defined in -`clang/include/clang/Options/Options.td` in Clang. When adding a new option to +`clang/include/clang/Driver/Options.td` in Clang. When adding a new option to Flang, you will either: * extend the existing definition for an option that is already available in one of Clang's drivers (e.g. `clang`), but not yet available in Flang, or @@ -314,7 +314,7 @@ add, you will have to add a dedicated entry in that enum (e.g. `ParseSyntaxOnly` for `-fsyntax-only`) and a corresponding `case` in `ParseFrontendArgs` function in the `CompilerInvocation.cpp` file, e.g.: ```cpp - case clang::options::OPT_fsyntax_only: + case clang::driver::options::OPT_fsyntax_only: opts.programAction = ParseSyntaxOnly; break; ``` diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt index bb0b4a39cec9b..2b3bc0e9c2269 100644 --- a/flang/lib/Frontend/CMakeLists.txt +++ b/flang/lib/Frontend/CMakeLists.txt @@ -76,7 +76,6 @@ add_flang_library(flangFrontend CLANG_LIBS clangBasic clangDriver - clangOptions ) target_precompile_headers(flangFrontend PRIVATE diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 8cb9a24cd58b8..5eba5e4cc8a53 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -26,8 +26,8 @@ #include "clang/Basic/DiagnosticOptions.h" #include "clang/Driver/CommonArgs.h" #include "clang/Driver/Driver.h" -#include "clang/Options/OptionUtils.h" -#include "clang/Options/Options.h" +#include "clang/Driver/OptionUtils.h" +#include "clang/Driver/Options.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -82,11 +82,11 @@ static bool parseShowColorsArgs(const llvm::opt::ArgList &args, for (auto *a : args) { const llvm::opt::Option &opt = a->getOption(); - if (opt.matches(clang::options::OPT_fcolor_diagnostics)) { + if (opt.matches(clang::driver::options::OPT_fcolor_diagnostics)) { showColors = Colors_On; - } else if (opt.matches(clang::options::OPT_fno_color_diagnostics)) { + } else if (opt.matches(clang::driver::options::OPT_fno_color_diagnostics)) { showColors = Colors_Off; - } else if (opt.matches(clang::options::OPT_fdiagnostics_color_EQ)) { + } else if (opt.matches(clang::driver::options::OPT_fdiagnostics_color_EQ)) { llvm::StringRef value(a->getValue()); if (value == "always") showColors = Colors_On; @@ -107,13 +107,15 @@ static unsigned getOptimizationLevel(llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { unsigned defaultOpt = 0; - if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_O_Group)) { - if (a->getOption().matches(clang::options::OPT_O0)) + if (llvm::opt::Arg *a = + args.getLastArg(clang::driver::options::OPT_O_Group)) { + if (a->getOption().matches(clang::driver::options::OPT_O0)) return 0; - assert(a->getOption().matches(clang::options::OPT_O)); + assert(a->getOption().matches(clang::driver::options::OPT_O)); - return getLastArgIntValue(args, clang::options::OPT_O, defaultOpt, diags); + return getLastArgIntValue(args, clang::driver::options::OPT_O, defaultOpt, + diags); } return defaultOpt; @@ -131,7 +133,7 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, clang::DiagnosticsEngine &diags) { using DebugInfoKind = llvm::codegenoptions::DebugInfoKind; if (llvm::opt::Arg *arg = - args.getLastArg(clang::options::OPT_debug_info_kind_EQ)) { + args.getLastArg(clang::driver::options::OPT_debug_info_kind_EQ)) { std::optional val = llvm::StringSwitch>(arg->getValue()) .Case("line-tables-only", llvm::codegenoptions::DebugLineTablesOnly) @@ -156,13 +158,13 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, diags.Report(debugWarning) << arg->getValue(); } opts.DwarfVersion = - getLastArgIntValue(args, clang::options::OPT_dwarf_version_EQ, + getLastArgIntValue(args, clang::driver::options::OPT_dwarf_version_EQ, /*Default=*/0, diags); if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_split_dwarf_file)) + args.getLastArg(clang::driver::options::OPT_split_dwarf_file)) opts.SplitDwarfFile = a->getValue(); if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_split_dwarf_output)) + args.getLastArg(clang::driver::options::OPT_split_dwarf_output)) opts.SplitDwarfOutput = a->getValue(); } return true; @@ -172,7 +174,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { llvm::opt::Arg *arg = - args.getLastArg(clang::options::OPT_fdo_concurrent_to_openmp_EQ); + args.getLastArg(clang::driver::options::OPT_fdo_concurrent_to_openmp_EQ); if (!arg) return; @@ -197,7 +199,7 @@ static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, static bool parseVectorLibArg(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fveclib); + llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fveclib); if (!arg) return true; @@ -235,7 +237,7 @@ parseOptimizationRemark(clang::DiagnosticsEngine &diags, CodeGenOptions::OptRemark result; for (llvm::opt::Arg *a : args) { - if (a->getOption().matches(clang::options::OPT_R_Joined)) { + if (a->getOption().matches(clang::driver::options::OPT_R_Joined)) { llvm::StringRef value = a->getValue(); if (value == remarkOptName) { @@ -272,39 +274,39 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::DiagnosticsEngine &diags) { opts.OptimizationLevel = getOptimizationLevel(args, diags); - if (args.hasFlag(clang::options::OPT_fdebug_pass_manager, - clang::options::OPT_fno_debug_pass_manager, false)) + if (args.hasFlag(clang::driver::options::OPT_fdebug_pass_manager, + clang::driver::options::OPT_fno_debug_pass_manager, false)) opts.DebugPassManager = 1; - if (args.hasFlag(clang::options::OPT_fstack_arrays, - clang::options::OPT_fno_stack_arrays, false)) + if (args.hasFlag(clang::driver::options::OPT_fstack_arrays, + clang::driver::options::OPT_fno_stack_arrays, false)) opts.StackArrays = 1; - if (args.getLastArg(clang::options::OPT_floop_interchange)) + if (args.getLastArg(clang::driver::options::OPT_floop_interchange)) opts.InterchangeLoops = 1; - if (args.getLastArg(clang::options::OPT_fexperimental_loop_fusion)) + if (args.getLastArg(clang::driver::options::OPT_fexperimental_loop_fusion)) opts.FuseLoops = 1; - if (args.getLastArg(clang::options::OPT_vectorize_loops)) + if (args.getLastArg(clang::driver::options::OPT_vectorize_loops)) opts.VectorizeLoop = 1; - if (args.getLastArg(clang::options::OPT_vectorize_slp)) + if (args.getLastArg(clang::driver::options::OPT_vectorize_slp)) opts.VectorizeSLP = 1; - if (args.hasFlag(clang::options::OPT_floop_versioning, - clang::options::OPT_fno_loop_versioning, false)) + if (args.hasFlag(clang::driver::options::OPT_floop_versioning, + clang::driver::options::OPT_fno_loop_versioning, false)) opts.LoopVersioning = 1; - opts.UnrollLoops = args.hasFlag(clang::options::OPT_funroll_loops, - clang::options::OPT_fno_unroll_loops, + opts.UnrollLoops = args.hasFlag(clang::driver::options::OPT_funroll_loops, + clang::driver::options::OPT_fno_unroll_loops, (opts.OptimizationLevel > 1)); opts.AliasAnalysis = opts.OptimizationLevel > 0; // -mframe-pointer=none/non-leaf/reserved/all option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_mframe_pointer_EQ)) { + args.getLastArg(clang::driver::options::OPT_mframe_pointer_EQ)) { std::optional val = llvm::StringSwitch>(a->getValue()) .Case("none", llvm::FramePointerKind::None) @@ -320,7 +322,7 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, opts.setFramePointer(val.value()); } - for (auto *a : args.filtered(clang::options::OPT_fpass_plugin_EQ)) + for (auto *a : args.filtered(clang::driver::options::OPT_fpass_plugin_EQ)) opts.LLVMPassPlugins.push_back(a->getValue()); opts.Reciprocals = clang::driver::tools::parseMRecipOption(diags, args); @@ -329,14 +331,15 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::driver::tools::parseMPreferVectorWidthOption(diags, args); // -fembed-offload-object option - for (auto *a : args.filtered(clang::options::OPT_fembed_offload_object_EQ)) + for (auto *a : + args.filtered(clang::driver::options::OPT_fembed_offload_object_EQ)) opts.OffloadObjects.push_back(a->getValue()); - if (args.hasArg(clang::options::OPT_finstrument_functions)) + if (args.hasArg(clang::driver::options::OPT_finstrument_functions)) opts.InstrumentFunctions = 1; - if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_mcode_object_version_EQ)) { + if (const llvm::opt::Arg *a = args.getLastArg( + clang::driver::options::OPT_mcode_object_version_EQ)) { llvm::StringRef s = a->getValue(); if (s == "6") opts.CodeObjectVersion = llvm::CodeObjectVersionKind::COV_6; @@ -350,36 +353,36 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, // -f[no-]save-optimization-record[=] if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_opt_record_file)) + args.getLastArg(clang::driver::options::OPT_opt_record_file)) opts.OptRecordFile = a->getValue(); // Optimization file format. Defaults to yaml if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_opt_record_format)) + args.getLastArg(clang::driver::options::OPT_opt_record_format)) opts.OptRecordFormat = a->getValue(); // Specifies, using a regex, which successful optimization passes(middle and // backend), to include in the final optimization record file generated. If // not provided -fsave-optimization-record will include all passes. if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_opt_record_passes)) + args.getLastArg(clang::driver::options::OPT_opt_record_passes)) opts.OptRecordPasses = a->getValue(); // Create OptRemark that allows printing of all successful optimization // passes applied. opts.OptimizationRemark = - parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_EQ, + parseOptimizationRemark(diags, args, clang::driver::options::OPT_Rpass_EQ, /*remarkOptName=*/"pass"); // Create OptRemark that allows all missed optimization passes to be printed. - opts.OptimizationRemarkMissed = - parseOptimizationRemark(diags, args, clang::options::OPT_Rpass_missed_EQ, - /*remarkOptName=*/"pass-missed"); + opts.OptimizationRemarkMissed = parseOptimizationRemark( + diags, args, clang::driver::options::OPT_Rpass_missed_EQ, + /*remarkOptName=*/"pass-missed"); // Create OptRemark that allows all optimization decisions made by LLVM // to be printed. opts.OptimizationRemarkAnalysis = parseOptimizationRemark( - diags, args, clang::options::OPT_Rpass_analysis_EQ, + diags, args, clang::driver::options::OPT_Rpass_analysis_EQ, /*remarkOptName=*/"pass-analysis"); if (opts.getDebugInfo() == llvm::codegenoptions::NoDebugInfo) { @@ -397,22 +400,23 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, opts.setDebugInfo(llvm::codegenoptions::LocTrackingOnly); } - if (auto *a = args.getLastArg(clang::options::OPT_save_temps_EQ)) + if (auto *a = args.getLastArg(clang::driver::options::OPT_save_temps_EQ)) opts.SaveTempsDir = a->getValue(); // -record-command-line option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_record_command_line)) { + args.getLastArg(clang::driver::options::OPT_record_command_line)) { opts.RecordCommandLine = a->getValue(); } // -mlink-builtin-bitcode - for (auto *a : args.filtered(clang::options::OPT_mlink_builtin_bitcode)) + for (auto *a : + args.filtered(clang::driver::options::OPT_mlink_builtin_bitcode)) opts.BuiltinBCLibs.push_back(a->getValue()); // -mrelocation-model option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_mrelocation_model)) { + args.getLastArg(clang::driver::options::OPT_mrelocation_model)) { llvm::StringRef modelName = a->getValue(); auto relocModel = llvm::StringSwitch>(modelName) @@ -431,30 +435,31 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, } // -pic-level and -pic-is-pie option. - if (int picLevel = - getLastArgIntValue(args, clang::options::OPT_pic_level, 0, diags)) { + if (int picLevel = getLastArgIntValue( + args, clang::driver::options::OPT_pic_level, 0, diags)) { if (picLevel > 2) diags.Report(clang::diag::err_drv_invalid_value) - << args.getLastArg(clang::options::OPT_pic_level)->getAsString(args) + << args.getLastArg(clang::driver::options::OPT_pic_level) + ->getAsString(args) << picLevel; opts.PICLevel = picLevel; - if (args.hasArg(clang::options::OPT_pic_is_pie)) + if (args.hasArg(clang::driver::options::OPT_pic_is_pie)) opts.IsPIE = 1; } - if (args.hasArg(clang::options::OPT_fprofile_generate)) { + if (args.hasArg(clang::driver::options::OPT_fprofile_generate)) { opts.setProfileInstr(llvm::driver::ProfileInstrKind::ProfileIRInstr); } - if (auto A = args.getLastArg(clang::options::OPT_fprofile_use_EQ)) { + if (auto A = args.getLastArg(clang::driver::options::OPT_fprofile_use_EQ)) { opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr); opts.ProfileInstrumentUsePath = A->getValue(); } // -mcmodel option. if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_mcmodel_EQ)) { + args.getLastArg(clang::driver::options::OPT_mcmodel_EQ)) { llvm::StringRef modelName = a->getValue(); std::optional codeModel = getCodeModel(modelName); @@ -465,8 +470,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, << a->getAsString(args) << modelName; } - if (const llvm::opt::Arg *arg = - args.getLastArg(clang::options::OPT_mlarge_data_threshold_EQ)) { + if (const llvm::opt::Arg *arg = args.getLastArg( + clang::driver::options::OPT_mlarge_data_threshold_EQ)) { uint64_t LDT; if (llvm::StringRef(arg->getValue()).getAsInteger(/*Radix=*/10, LDT)) { diags.Report(clang::diag::err_drv_invalid_value) @@ -476,8 +481,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, } // This option is compatible with -f[no-]underscoring in gfortran. - if (args.hasFlag(clang::options::OPT_fno_underscoring, - clang::options::OPT_funderscoring, false)) { + if (args.hasFlag(clang::driver::options::OPT_fno_underscoring, + clang::driver::options::OPT_funderscoring, false)) { opts.Underscoring = 0; } @@ -494,7 +499,7 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::driver::options::OPT_fno_defer_desc_map, true); if (const llvm::opt::Arg *arg = - args.getLastArg(clang::options::OPT_complex_range_EQ)) { + args.getLastArg(clang::driver::options::OPT_complex_range_EQ)) { llvm::StringRef argValue = llvm::StringRef(arg->getValue()); if (argValue == "full") { opts.setComplexRange(CodeGenOptions::ComplexRangeKind::CX_Full); @@ -515,42 +520,46 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, /// \param [in] opts The target options instance to update /// \param [in] args The list of input arguments (from the compiler invocation) static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { - if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_triple)) + if (const llvm::opt::Arg *a = + args.getLastArg(clang::driver::options::OPT_triple)) opts.triple = a->getValue(); - opts.atomicIgnoreDenormalMode = - args.hasFlag(clang::options::OPT_fatomic_ignore_denormal_mode, - clang::options::OPT_fno_atomic_ignore_denormal_mode, false); - opts.atomicFineGrainedMemory = - args.hasFlag(clang::options::OPT_fatomic_fine_grained_memory, - clang::options::OPT_fno_atomic_fine_grained_memory, false); + opts.atomicIgnoreDenormalMode = args.hasFlag( + clang::driver::options::OPT_fatomic_ignore_denormal_mode, + clang::driver::options::OPT_fno_atomic_ignore_denormal_mode, false); + opts.atomicFineGrainedMemory = args.hasFlag( + clang::driver::options::OPT_fatomic_fine_grained_memory, + clang::driver::options::OPT_fno_atomic_fine_grained_memory, false); opts.atomicRemoteMemory = - args.hasFlag(clang::options::OPT_fatomic_remote_memory, - clang::options::OPT_fno_atomic_remote_memory, false); + args.hasFlag(clang::driver::options::OPT_fatomic_remote_memory, + clang::driver::options::OPT_fno_atomic_remote_memory, false); - if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_target_cpu)) + if (const llvm::opt::Arg *a = + args.getLastArg(clang::driver::options::OPT_target_cpu)) opts.cpu = a->getValue(); - if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_tune_cpu)) + if (const llvm::opt::Arg *a = + args.getLastArg(clang::driver::options::OPT_tune_cpu)) opts.cpuToTuneFor = a->getValue(); for (const llvm::opt::Arg *currentArg : - args.filtered(clang::options::OPT_target_feature)) + args.filtered(clang::driver::options::OPT_target_feature)) opts.featuresAsWritten.emplace_back(currentArg->getValue()); - if (args.hasArg(clang::options::OPT_fdisable_real_10)) + if (args.hasArg(clang::driver::options::OPT_fdisable_real_10)) opts.disabledRealKinds.push_back(10); - if (args.hasArg(clang::options::OPT_fdisable_real_3)) + if (args.hasArg(clang::driver::options::OPT_fdisable_real_3)) opts.disabledRealKinds.push_back(3); - if (args.hasArg(clang::options::OPT_fdisable_integer_2)) + if (args.hasArg(clang::driver::options::OPT_fdisable_integer_2)) opts.disabledIntegerKinds.push_back(2); - if (args.hasArg(clang::options::OPT_fdisable_integer_16)) + if (args.hasArg(clang::driver::options::OPT_fdisable_integer_16)) opts.disabledIntegerKinds.push_back(16); - if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_mabi_EQ)) { + if (const llvm::opt::Arg *a = + args.getLastArg(clang::driver::options::OPT_mabi_EQ)) { opts.abi = a->getValue(); llvm::StringRef V = a->getValue(); if (V == "vec-extabi") { @@ -560,8 +569,9 @@ static void parseTargetArgs(TargetOptions &opts, llvm::opt::ArgList &args) { } } - opts.asmVerbose = args.hasFlag(clang::options::OPT_fverbose_asm, - clang::options::OPT_fno_verbose_asm, false); + opts.asmVerbose = + args.hasFlag(clang::driver::options::OPT_fverbose_asm, + clang::driver::options::OPT_fno_verbose_asm, false); } // Tweak the frontend configuration based on the frontend action static void setUpFrontendBasedOnAction(FrontendOptions &opts) { @@ -594,11 +604,11 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Treat multiple action options as an invocation error. Note that `clang // -cc1` does accept multiple action options, but will only consider the // rightmost one. - if (args.hasMultipleArgs(clang::options::OPT_Action_Group)) { + if (args.hasMultipleArgs(clang::driver::options::OPT_Action_Group)) { llvm::SmallString<32> buf; llvm::raw_svector_ostream os(buf); for (const llvm::opt::Arg *arg : - args.filtered(clang::options::OPT_Action_Group)) { + args.filtered(clang::driver::options::OPT_Action_Group)) { if (buf.size()) os << ", "; os << "'" << arg->getSpelling() << "'"; @@ -609,99 +619,99 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Identify the action (i.e. opts.ProgramAction) if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_Action_Group)) { + args.getLastArg(clang::driver::options::OPT_Action_Group)) { switch (a->getOption().getID()) { default: { llvm_unreachable("Invalid option in group!"); } - case clang::options::OPT_test_io: + case clang::driver::options::OPT_test_io: opts.programAction = InputOutputTest; break; - case clang::options::OPT_E: + case clang::driver::options::OPT_E: opts.programAction = PrintPreprocessedInput; break; - case clang::options::OPT_fsyntax_only: + case clang::driver::options::OPT_fsyntax_only: opts.programAction = ParseSyntaxOnly; break; - case clang::options::OPT_emit_fir: + case clang::driver::options::OPT_emit_fir: opts.programAction = EmitFIR; break; - case clang::options::OPT_emit_hlfir: + case clang::driver::options::OPT_emit_hlfir: opts.programAction = EmitHLFIR; break; - case clang::options::OPT_emit_llvm: + case clang::driver::options::OPT_emit_llvm: opts.programAction = EmitLLVM; break; - case clang::options::OPT_emit_llvm_bc: + case clang::driver::options::OPT_emit_llvm_bc: opts.programAction = EmitLLVMBitcode; break; - case clang::options::OPT_emit_obj: + case clang::driver::options::OPT_emit_obj: opts.programAction = EmitObj; break; - case clang::options::OPT_S: + case clang::driver::options::OPT_S: opts.programAction = EmitAssembly; break; - case clang::options::OPT_fdebug_unparse: + case clang::driver::options::OPT_fdebug_unparse: opts.programAction = DebugUnparse; break; - case clang::options::OPT_fdebug_unparse_no_sema: + case clang::driver::options::OPT_fdebug_unparse_no_sema: opts.programAction = DebugUnparseNoSema; break; - case clang::options::OPT_fdebug_unparse_with_symbols: + case clang::driver::options::OPT_fdebug_unparse_with_symbols: opts.programAction = DebugUnparseWithSymbols; break; - case clang::options::OPT_fdebug_unparse_with_modules: + case clang::driver::options::OPT_fdebug_unparse_with_modules: opts.programAction = DebugUnparseWithModules; break; - case clang::options::OPT_fdebug_dump_symbols: + case clang::driver::options::OPT_fdebug_dump_symbols: opts.programAction = DebugDumpSymbols; break; - case clang::options::OPT_fdebug_dump_parse_tree: + case clang::driver::options::OPT_fdebug_dump_parse_tree: opts.programAction = DebugDumpParseTree; break; - case clang::options::OPT_fdebug_dump_pft: + case clang::driver::options::OPT_fdebug_dump_pft: opts.programAction = DebugDumpPFT; break; - case clang::options::OPT_fdebug_dump_all: + case clang::driver::options::OPT_fdebug_dump_all: opts.programAction = DebugDumpAll; break; - case clang::options::OPT_fdebug_dump_parse_tree_no_sema: + case clang::driver::options::OPT_fdebug_dump_parse_tree_no_sema: opts.programAction = DebugDumpParseTreeNoSema; break; - case clang::options::OPT_fdebug_dump_provenance: + case clang::driver::options::OPT_fdebug_dump_provenance: opts.programAction = DebugDumpProvenance; break; - case clang::options::OPT_fdebug_dump_parsing_log: + case clang::driver::options::OPT_fdebug_dump_parsing_log: opts.programAction = DebugDumpParsingLog; break; - case clang::options::OPT_fdebug_measure_parse_tree: + case clang::driver::options::OPT_fdebug_measure_parse_tree: opts.programAction = DebugMeasureParseTree; break; - case clang::options::OPT_fdebug_pre_fir_tree: + case clang::driver::options::OPT_fdebug_pre_fir_tree: opts.programAction = DebugPreFIRTree; break; - case clang::options::OPT_fget_symbols_sources: + case clang::driver::options::OPT_fget_symbols_sources: opts.programAction = GetSymbolsSources; break; - case clang::options::OPT_fget_definition: + case clang::driver::options::OPT_fget_definition: opts.programAction = GetDefinition; break; - case clang::options::OPT_init_only: + case clang::driver::options::OPT_init_only: opts.programAction = InitOnly; break; // TODO: - // case clang::options::OPT_emit_llvm: - // case clang::options::OPT_emit_llvm_only: - // case clang::options::OPT_emit_codegen_only: - // case clang::options::OPT_emit_module: + // case clang::driver::options::OPT_emit_llvm: + // case clang::driver::options::OPT_emit_llvm_only: + // case clang::driver::options::OPT_emit_codegen_only: + // case clang::driver::options::OPT_emit_module: // (...) } // Parse the values provided with `-fget-definition` (there should be 3 // integers) if (llvm::opt::OptSpecifier(a->getOption().getID()) == - clang::options::OPT_fget_definition) { + clang::driver::options::OPT_fget_definition) { unsigned optVals[3] = {0, 0, 0}; for (unsigned i = 0; i < 3; i++) { @@ -721,25 +731,27 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Parsing -load option and storing shared object path - if (llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_load)) { + if (llvm::opt::Arg *a = args.getLastArg(clang::driver::options::OPT_load)) { opts.plugins.push_back(a->getValue()); } // Parsing -plugin option and storing plugin name and setting action - if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_plugin)) { + if (const llvm::opt::Arg *a = + args.getLastArg(clang::driver::options::OPT_plugin)) { opts.programAction = PluginAction; opts.actionName = a->getValue(); } - opts.outputFile = args.getLastArgValue(clang::options::OPT_o); - opts.showHelp = args.hasArg(clang::options::OPT_help); - opts.showVersion = args.hasArg(clang::options::OPT_version); + opts.outputFile = args.getLastArgValue(clang::driver::options::OPT_o); + opts.showHelp = args.hasArg(clang::driver::options::OPT_help); + opts.showVersion = args.hasArg(clang::driver::options::OPT_version); opts.printSupportedCPUs = - args.hasArg(clang::options::OPT_print_supported_cpus); + args.hasArg(clang::driver::options::OPT_print_supported_cpus); // Get the input kind (from the value passed via `-x`) InputKind dashX(Language::Unknown); - if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_x)) { + if (const llvm::opt::Arg *a = + args.getLastArg(clang::driver::options::OPT_x)) { llvm::StringRef xValue = a->getValue(); // Principal languages. dashX = llvm::StringSwitch(xValue) @@ -766,7 +778,7 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, // Collect the input files and save them in our instance of FrontendOptions. std::vector inputs = - args.getAllArgValues(clang::options::OPT_INPUT); + args.getAllArgValues(clang::driver::options::OPT_INPUT); opts.inputs.clear(); if (inputs.empty()) // '-' is the default input if none is given. @@ -786,16 +798,18 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Set fortranForm based on options -ffree-form and -ffixed-form. - if (const auto *arg = args.getLastArg(clang::options::OPT_ffixed_form, - clang::options::OPT_ffree_form)) { - opts.fortranForm = arg->getOption().matches(clang::options::OPT_ffixed_form) - ? FortranForm::FixedForm - : FortranForm::FreeForm; + if (const auto *arg = + args.getLastArg(clang::driver::options::OPT_ffixed_form, + clang::driver::options::OPT_ffree_form)) { + opts.fortranForm = + arg->getOption().matches(clang::driver::options::OPT_ffixed_form) + ? FortranForm::FixedForm + : FortranForm::FreeForm; } // Set fixedFormColumns based on -ffixed-line-length= if (const auto *arg = - args.getLastArg(clang::options::OPT_ffixed_line_length_EQ)) { + args.getLastArg(clang::driver::options::OPT_ffixed_line_length_EQ)) { llvm::StringRef argValue = llvm::StringRef(arg->getValue()); std::int64_t columns = -1; if (argValue == "none") { @@ -817,7 +831,8 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // Set conversion based on -fconvert= - if (const auto *arg = args.getLastArg(clang::options::OPT_fconvert_EQ)) { + if (const auto *arg = + args.getLastArg(clang::driver::options::OPT_fconvert_EQ)) { const char *argValue = arg->getValue(); if (auto convert = parseConvertArg(argValue)) opts.envDefaults.push_back({"FORT_CONVERT", *convert}); @@ -827,55 +842,65 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args, } // -f{no-}implicit-none - opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneTypeAlways, - args.hasFlag(clang::options::OPT_fimplicit_none, - clang::options::OPT_fno_implicit_none, - false)); + opts.features.Enable( + Fortran::common::LanguageFeature::ImplicitNoneTypeAlways, + args.hasFlag(clang::driver::options::OPT_fimplicit_none, + clang::driver::options::OPT_fno_implicit_none, false)); // -f{no-}implicit-none-ext - opts.features.Enable(Fortran::common::LanguageFeature::ImplicitNoneExternal, - args.hasFlag(clang::options::OPT_fimplicit_none_ext, - clang::options::OPT_fno_implicit_none_ext, - false)); + opts.features.Enable( + Fortran::common::LanguageFeature::ImplicitNoneExternal, + args.hasFlag(clang::driver::options::OPT_fimplicit_none_ext, + clang::driver::options::OPT_fno_implicit_none_ext, false)); // -f{no-}backslash opts.features.Enable(Fortran::common::LanguageFeature::BackslashEscapes, - args.hasFlag(clang::options::OPT_fbackslash, - clang::options::OPT_fno_backslash, false)); + args.hasFlag(clang::driver::options::OPT_fbackslash, + clang::driver::options::OPT_fno_backslash, + false)); // -f{no-}logical-abbreviations opts.features.Enable( Fortran::common::LanguageFeature::LogicalAbbreviations, - args.hasFlag(clang::options::OPT_flogical_abbreviations, - clang::options::OPT_fno_logical_abbreviations, false)); + args.hasFlag(clang::driver::options::OPT_flogical_abbreviations, + clang::driver::options::OPT_fno_logical_abbreviations, + false)); // -f{no-}unsigned opts.features.Enable(Fortran::common::LanguageFeature::Unsigned, - args.hasFlag(clang::options::OPT_funsigned, - clang::options::OPT_fno_unsigned, false)); + args.hasFlag(clang::driver::options::OPT_funsigned, + clang::driver::options::OPT_fno_unsigned, + false)); // -f{no-}xor-operator - opts.features.Enable(Fortran::common::LanguageFeature::XOROperator, - args.hasFlag(clang::options::OPT_fxor_operator, - clang::options::OPT_fno_xor_operator, - false)); + opts.features.Enable( + Fortran::common::LanguageFeature::XOROperator, + args.hasFlag(clang::driver::options::OPT_fxor_operator, + clang::driver::options::OPT_fno_xor_operator, false)); // -fno-automatic - if (args.hasArg(clang::options::OPT_fno_automatic)) { + if (args.hasArg(clang::driver::options::OPT_fno_automatic)) { opts.features.Enable(Fortran::common::LanguageFeature::DefaultSave); } // -f{no}-save-main-program - opts.features.Enable(Fortran::common::LanguageFeature::SaveMainProgram, - args.hasFlag(clang::options::OPT_fsave_main_program, - clang::options::OPT_fno_save_main_program, - false)); + opts.features.Enable( + Fortran::common::LanguageFeature::SaveMainProgram, + args.hasFlag(clang::driver::options::OPT_fsave_main_program, + clang::driver::options::OPT_fno_save_main_program, false)); + + // -ffast-amd-memory-allocator + if (args.hasArg(clang::driver::options::OPT_ffast_amd_memory_allocator)) { + opts.features.Enable( + (Fortran::common::LanguageFeature::AmdMemoryAllocator)); + } - if (args.hasArg(clang::options::OPT_falternative_parameter_statement)) { + if (args.hasArg( + clang::driver::options::OPT_falternative_parameter_statement)) { opts.features.Enable(Fortran::common::LanguageFeature::OldStyleParameter); } if (const llvm::opt::Arg *arg = - args.getLastArg(clang::options::OPT_finput_charset_EQ)) { + args.getLastArg(clang::driver::options::OPT_finput_charset_EQ)) { llvm::StringRef argValue = arg->getValue(); if (argValue == "utf-8") { opts.encoding = Fortran::parser::Encoding::UTF_8; @@ -920,9 +945,9 @@ static std::string getOpenMPHeadersDir(const char *argv) { static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts, llvm::opt::ArgList &args) { // Add macros from the command line. - for (const auto *currentArg : - args.filtered(clang::options::OPT_D, clang::options::OPT_U)) { - if (currentArg->getOption().matches(clang::options::OPT_D)) { + for (const auto *currentArg : args.filtered(clang::driver::options::OPT_D, + clang::driver::options::OPT_U)) { + if (currentArg->getOption().matches(clang::driver::options::OPT_D)) { opts.addMacroDef(currentArg->getValue()); } else { opts.addMacroUndef(currentArg->getValue()); @@ -930,33 +955,34 @@ static void parsePreprocessorArgs(Fortran::frontend::PreprocessorOptions &opts, } // Add the ordered list of -I's. - for (const auto *currentArg : args.filtered(clang::options::OPT_I)) + for (const auto *currentArg : args.filtered(clang::driver::options::OPT_I)) opts.searchDirectoriesFromDashI.emplace_back(currentArg->getValue()); // Prepend the ordered list of -intrinsic-modules-path // to the default location to search. for (const auto *currentArg : - args.filtered(clang::options::OPT_fintrinsic_modules_path)) + args.filtered(clang::driver::options::OPT_fintrinsic_modules_path)) opts.searchDirectoriesFromIntrModPath.emplace_back(currentArg->getValue()); // -cpp/-nocpp - if (const auto *currentArg = - args.getLastArg(clang::options::OPT_cpp, clang::options::OPT_nocpp)) - opts.macrosFlag = (currentArg->getOption().matches(clang::options::OPT_cpp)) - ? PPMacrosFlag::Include - : PPMacrosFlag::Exclude; + if (const auto *currentArg = args.getLastArg( + clang::driver::options::OPT_cpp, clang::driver::options::OPT_nocpp)) + opts.macrosFlag = + (currentArg->getOption().matches(clang::driver::options::OPT_cpp)) + ? PPMacrosFlag::Include + : PPMacrosFlag::Exclude; // Enable -cpp based on -x unless explicitly disabled with -nocpp if (opts.macrosFlag != PPMacrosFlag::Exclude) - if (const auto *dashX = args.getLastArg(clang::options::OPT_x)) + if (const auto *dashX = args.getLastArg(clang::driver::options::OPT_x)) opts.macrosFlag = llvm::StringSwitch(dashX->getValue()) .Case("f95-cpp-input", PPMacrosFlag::Include) .Default(opts.macrosFlag); - opts.noReformat = args.hasArg(clang::options::OPT_fno_reformat); + opts.noReformat = args.hasArg(clang::driver::options::OPT_fno_reformat); opts.preprocessIncludeLines = - args.hasArg(clang::options::OPT_fpreprocess_include_lines); - opts.noLineDirectives = args.hasArg(clang::options::OPT_P); - opts.showMacros = args.hasArg(clang::options::OPT_dM); + args.hasArg(clang::driver::options::OPT_fpreprocess_include_lines); + opts.noLineDirectives = args.hasArg(clang::driver::options::OPT_P); + opts.showMacros = args.hasArg(clang::driver::options::OPT_dM); } /// Parses all semantic related arguments and populates the variables @@ -967,7 +993,7 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -J/module-dir option std::vector moduleDirList = - args.getAllArgValues(clang::options::OPT_module_dir); + args.getAllArgValues(clang::driver::options::OPT_module_dir); // User can only specify one -J/-module-dir directory, but may repeat // -J/-module-dir as long as the directory is the same each time. // https://gcc.gnu.org/onlinedocs/gfortran/Directory-Options.html @@ -986,25 +1012,25 @@ static bool parseSemaArgs(CompilerInvocation &res, llvm::opt::ArgList &args, res.setModuleDir(moduleDirList[0]); // -fdebug-module-writer option - if (args.hasArg(clang::options::OPT_fdebug_module_writer)) { + if (args.hasArg(clang::driver::options::OPT_fdebug_module_writer)) { res.setDebugModuleDir(true); } // -fhermetic-module-files option - if (args.hasArg(clang::options::OPT_fhermetic_module_files)) { + if (args.hasArg(clang::driver::options::OPT_fhermetic_module_files)) { res.setHermeticModuleFileOutput(true); } // -module-suffix if (const auto *moduleSuffix = - args.getLastArg(clang::options::OPT_module_suffix)) { + args.getLastArg(clang::driver::options::OPT_module_suffix)) { res.setModuleFileSuffix(moduleSuffix->getValue()); } // -f{no-}analyzed-objects-for-unparse - res.setUseAnalyzedObjectsForUnparse( - args.hasFlag(clang::options::OPT_fanalyzed_objects_for_unparse, - clang::options::OPT_fno_analyzed_objects_for_unparse, true)); + res.setUseAnalyzedObjectsForUnparse(args.hasFlag( + clang::driver::options::OPT_fanalyzed_objects_for_unparse, + clang::driver::options::OPT_fno_analyzed_objects_for_unparse, true)); return diags.getNumErrors() == numErrorsBefore; } @@ -1021,7 +1047,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // chosen to match clang's behavior. // -pedantic - if (args.hasArg(clang::options::OPT_pedantic)) { + if (args.hasArg(clang::driver::options::OPT_pedantic)) { features.WarnOnAllNonstandard(); features.WarnOnAllUsage(); res.setEnableConformanceChecks(); @@ -1031,8 +1057,9 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -Werror option // TODO: Currently throws a Diagnostic for anything other than -W, // this has to change when other -W's are supported. - if (args.hasArg(clang::options::OPT_W_Joined)) { - const auto &wArgs = args.getAllArgValues(clang::options::OPT_W_Joined); + if (args.hasArg(clang::driver::options::OPT_W_Joined)) { + const auto &wArgs = + args.getAllArgValues(clang::driver::options::OPT_W_Joined); for (const auto &wArg : wArgs) { if (wArg == "error") { res.setWarnAsErr(true); @@ -1049,7 +1076,7 @@ static bool parseDiagArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -w - if (args.hasArg(clang::options::OPT_w)) { + if (args.hasArg(clang::driver::options::OPT_w)) { features.DisableAllWarnings(); res.setDisableWarnings(); } @@ -1069,7 +1096,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, unsigned numErrorsBefore = diags.getNumErrors(); // -fd-lines-as-code - if (args.hasArg(clang::options::OPT_fd_lines_as_code)) { + if (args.hasArg(clang::driver::options::OPT_fd_lines_as_code)) { if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) { const unsigned fdLinesAsWarning = diags.getCustomDiagID( clang::DiagnosticsEngine::Warning, @@ -1082,7 +1109,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -fd-lines-as-comments - if (args.hasArg(clang::options::OPT_fd_lines_as_comments)) { + if (args.hasArg(clang::driver::options::OPT_fd_lines_as_comments)) { if (res.getFrontendOpts().fortranForm == FortranForm::FreeForm) { const unsigned fdLinesAsWarning = diags.getCustomDiagID( clang::DiagnosticsEngine::Warning, @@ -1095,18 +1122,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // -fdefault* family - if (args.hasArg(clang::options::OPT_fdefault_real_8)) { + if (args.hasArg(clang::driver::options::OPT_fdefault_real_8)) { res.getDefaultKinds().set_defaultRealKind(8); res.getDefaultKinds().set_doublePrecisionKind(16); } - if (args.hasArg(clang::options::OPT_fdefault_integer_8)) { + if (args.hasArg(clang::driver::options::OPT_fdefault_integer_8)) { res.getDefaultKinds().set_defaultIntegerKind(8); res.getDefaultKinds().set_subscriptIntegerKind(8); res.getDefaultKinds().set_sizeIntegerKind(8); res.getDefaultKinds().set_defaultLogicalKind(8); } - if (args.hasArg(clang::options::OPT_fdefault_double_8)) { - if (!args.hasArg(clang::options::OPT_fdefault_real_8)) { + if (args.hasArg(clang::driver::options::OPT_fdefault_double_8)) { + if (!args.hasArg(clang::driver::options::OPT_fdefault_real_8)) { // -fdefault-double-8 has to be used with -fdefault-real-8 // to be compatible with gfortran const unsigned diagID = diags.getCustomDiagID( @@ -1117,18 +1144,18 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // https://gcc.gnu.org/onlinedocs/gfortran/Fortran-Dialect-Options.html res.getDefaultKinds().set_doublePrecisionKind(8); } - if (args.hasArg(clang::options::OPT_flarge_sizes)) + if (args.hasArg(clang::driver::options::OPT_flarge_sizes)) res.getDefaultKinds().set_sizeIntegerKind(8); // -x cuda - auto language = args.getLastArgValue(clang::options::OPT_x); + auto language = args.getLastArgValue(clang::driver::options::OPT_x); if (language == "cuda") { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::CUDA); } // -fopenacc - if (args.hasArg(clang::options::OPT_fopenacc)) { + if (args.hasArg(clang::driver::options::OPT_fopenacc)) { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::OpenACC); } @@ -1136,8 +1163,8 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, // -std=f2018 // TODO: Set proper options when more fortran standards // are supported. - if (args.hasArg(clang::options::OPT_std_EQ)) { - auto standard = args.getLastArgValue(clang::options::OPT_std_EQ); + if (args.hasArg(clang::driver::options::OPT_std_EQ)) { + auto standard = args.getLastArgValue(clang::driver::options::OPT_std_EQ); // We only allow f2018 as the given standard if (standard == "f2018") { res.setEnableConformanceChecks(); @@ -1150,7 +1177,7 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } } // -fcoarray - if (args.hasArg(clang::options::OPT_fcoarray)) { + if (args.hasArg(clang::driver::options::OPT_fcoarray)) { res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::Coarray); const unsigned diagID = @@ -1168,12 +1195,13 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, /// generated. static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - llvm::opt::Arg *arg = args.getLastArg(clang::options::OPT_fopenmp, - clang::options::OPT_fno_openmp); - if (!arg || arg->getOption().matches(clang::options::OPT_fno_openmp)) { - bool isSimdSpecified = - args.hasFlag(clang::options::OPT_fopenmp_simd, - clang::options::OPT_fno_openmp_simd, /*Default=*/false); + llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp, + clang::driver::options::OPT_fno_openmp); + if (!arg || + arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) { + bool isSimdSpecified = args.hasFlag( + clang::driver::options::OPT_fopenmp_simd, + clang::driver::options::OPT_fno_openmp_simd, /*Default=*/false); if (!isSimdSpecified) return true; res.getLangOpts().OpenMPSimd = 1; @@ -1189,7 +1217,8 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, res.getLangOpts().OpenMPVersion = newestFullySupported; res.getFrontendOpts().features.Enable( Fortran::common::LanguageFeature::OpenMP); - if (auto *arg = args.getLastArg(clang::options::OPT_fopenmp_version_EQ)) { + if (auto *arg = + args.getLastArg(clang::driver::options::OPT_fopenmp_version_EQ)) { llvm::ArrayRef ompVersions = llvm::omp::getOpenMPVersions(); unsigned oldVersions[] = {11, 20, 25, 30}; unsigned version = 0; @@ -1244,16 +1273,16 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } } - if (args.hasArg(clang::options::OPT_fopenmp_force_usm)) { + if (args.hasArg(clang::driver::options::OPT_fopenmp_force_usm)) { res.getLangOpts().OpenMPForceUSM = 1; } - if (args.hasArg(clang::options::OPT_fopenmp_is_target_device)) { + if (args.hasArg(clang::driver::options::OPT_fopenmp_is_target_device)) { res.getLangOpts().OpenMPIsTargetDevice = 1; // Get OpenMP host file path if any and report if a non existent file is // found - if (auto *arg = - args.getLastArg(clang::options::OPT_fopenmp_host_ir_file_path)) { + if (auto *arg = args.getLastArg( + clang::driver::options::OPT_fopenmp_host_ir_file_path)) { res.getLangOpts().OMPHostIRFile = arg->getValue(); if (!llvm::sys::fs::exists(res.getLangOpts().OMPHostIRFile)) diags.Report(clang::diag::err_omp_host_ir_file_not_found) @@ -1261,34 +1290,37 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } if (args.hasFlag( - clang::options::OPT_fopenmp_assume_teams_oversubscription, - clang::options::OPT_fno_openmp_assume_teams_oversubscription, + clang::driver::options::OPT_fopenmp_assume_teams_oversubscription, + clang::driver::options:: + OPT_fno_openmp_assume_teams_oversubscription, /*Default=*/false)) res.getLangOpts().OpenMPTeamSubscription = true; - if (args.hasArg(clang::options::OPT_fopenmp_assume_no_thread_state)) + if (args.hasArg(clang::driver::options::OPT_fopenmp_assume_no_thread_state)) res.getLangOpts().OpenMPNoThreadState = 1; - if (args.hasArg(clang::options::OPT_fopenmp_assume_no_nested_parallelism)) + if (args.hasArg( + clang::driver::options::OPT_fopenmp_assume_no_nested_parallelism)) res.getLangOpts().OpenMPNoNestedParallelism = 1; if (args.hasFlag( - clang::options::OPT_fopenmp_assume_threads_oversubscription, - clang::options::OPT_fno_openmp_assume_threads_oversubscription, + clang::driver::options::OPT_fopenmp_assume_threads_oversubscription, + clang::driver::options:: + OPT_fno_openmp_assume_threads_oversubscription, /*Default=*/false)) res.getLangOpts().OpenMPThreadSubscription = true; - if ((args.hasArg(clang::options::OPT_fopenmp_target_debug) || - args.hasArg(clang::options::OPT_fopenmp_target_debug_EQ))) { - res.getLangOpts().OpenMPTargetDebug = - getLastArgIntValue(args, clang::options::OPT_fopenmp_target_debug_EQ, - res.getLangOpts().OpenMPTargetDebug, diags); + if ((args.hasArg(clang::driver::options::OPT_fopenmp_target_debug) || + args.hasArg(clang::driver::options::OPT_fopenmp_target_debug_EQ))) { + res.getLangOpts().OpenMPTargetDebug = getLastArgIntValue( + args, clang::driver::options::OPT_fopenmp_target_debug_EQ, + res.getLangOpts().OpenMPTargetDebug, diags); if (!res.getLangOpts().OpenMPTargetDebug && - args.hasArg(clang::options::OPT_fopenmp_target_debug)) + args.hasArg(clang::driver::options::OPT_fopenmp_target_debug)) res.getLangOpts().OpenMPTargetDebug = 1; } - if (args.hasArg(clang::options::OPT_no_offloadlib)) + if (args.hasArg(clang::driver::options::OPT_no_offloadlib)) res.getLangOpts().NoGPULib = 1; } if (llvm::Triple(res.getTargetOpts().triple).isGPU()) { @@ -1304,7 +1336,8 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, } // Get the OpenMP target triples if any. - if (auto *arg = args.getLastArg(clang::options::OPT_offload_targets_EQ)) { + if (auto *arg = + args.getLastArg(clang::driver::options::OPT_offload_targets_EQ)) { enum ArchPtrSize { Arch16Bit, Arch32Bit, Arch64Bit }; auto getArchPtrSize = [](const llvm::Triple &triple) { if (triple.isArch16Bit()) @@ -1352,7 +1385,7 @@ static bool parseIntegerOverflowArgs(CompilerInvocation &invoc, clang::DiagnosticsEngine &diags) { Fortran::common::LangOptions &opts = invoc.getLangOpts(); - if (args.getLastArg(clang::options::OPT_fwrapv)) + if (args.getLastArg(clang::driver::options::OPT_fwrapv)) opts.setSignedOverflowBehavior(Fortran::common::LangOptions::SOB_Defined); return true; @@ -1371,7 +1404,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, Fortran::common::LangOptions &opts = invoc.getLangOpts(); if (const llvm::opt::Arg *a = - args.getLastArg(clang::options::OPT_ffp_contract)) { + args.getLastArg(clang::driver::options::OPT_ffp_contract)) { const llvm::StringRef val = a->getValue(); enum Fortran::common::LangOptions::FPModeKind fpContractMode; @@ -1388,31 +1421,31 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, opts.setFPContractMode(fpContractMode); } - if (args.getLastArg(clang::options::OPT_menable_no_infs)) { + if (args.getLastArg(clang::driver::options::OPT_menable_no_infs)) { opts.NoHonorInfs = true; } - if (args.getLastArg(clang::options::OPT_menable_no_nans)) { + if (args.getLastArg(clang::driver::options::OPT_menable_no_nans)) { opts.NoHonorNaNs = true; } - if (args.getLastArg(clang::options::OPT_fapprox_func)) { + if (args.getLastArg(clang::driver::options::OPT_fapprox_func)) { opts.ApproxFunc = true; } - if (args.getLastArg(clang::options::OPT_fno_signed_zeros)) { + if (args.getLastArg(clang::driver::options::OPT_fno_signed_zeros)) { opts.NoSignedZeros = true; } - if (args.getLastArg(clang::options::OPT_mreassociate)) { + if (args.getLastArg(clang::driver::options::OPT_mreassociate)) { opts.AssociativeMath = true; } - if (args.getLastArg(clang::options::OPT_freciprocal_math)) { + if (args.getLastArg(clang::driver::options::OPT_freciprocal_math)) { opts.ReciprocalMath = true; } - if (args.getLastArg(clang::options::OPT_ffast_math)) { + if (args.getLastArg(clang::driver::options::OPT_ffast_math)) { opts.NoHonorInfs = true; opts.NoHonorNaNs = true; opts.AssociativeMath = true; @@ -1422,7 +1455,7 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, opts.setFPContractMode(Fortran::common::LangOptions::FPM_Fast); } - if (args.hasArg(clang::options::OPT_fno_fast_real_mod)) + if (args.hasArg(clang::driver::options::OPT_fno_fast_real_mod)) opts.NoFastRealMod = true; return true; @@ -1437,8 +1470,10 @@ static bool parseFloatingPointArgs(CompilerInvocation &invoc, /// \param [out] diags DiagnosticsEngine to report erros with static bool parseVScaleArgs(CompilerInvocation &invoc, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { - const auto *vscaleMin = args.getLastArg(clang::options::OPT_mvscale_min_EQ); - const auto *vscaleMax = args.getLastArg(clang::options::OPT_mvscale_max_EQ); + const auto *vscaleMin = + args.getLastArg(clang::driver::options::OPT_mvscale_min_EQ); + const auto *vscaleMax = + args.getLastArg(clang::driver::options::OPT_mvscale_max_EQ); if (!vscaleMin && !vscaleMax) return true; @@ -1486,7 +1521,8 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, // TODO: support --dependent-lib on other platforms when MLIR supports // !llvm.dependent.lib - if (args.hasArg(clang::options::OPT_dependent_lib) && !triple.isOSWindows()) { + if (args.hasArg(clang::driver::options::OPT_dependent_lib) && + !triple.isOSWindows()) { const unsigned diagID = diags.getCustomDiagID(clang::DiagnosticsEngine::Error, "--dependent-lib is only supported on Windows"); @@ -1494,10 +1530,12 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, return false; } - opts.DependentLibs = args.getAllArgValues(clang::options::OPT_dependent_lib); + opts.DependentLibs = + args.getAllArgValues(clang::driver::options::OPT_dependent_lib); // -flto=full/thin option. - if (const llvm::opt::Arg *a = args.getLastArg(clang::options::OPT_flto_EQ)) { + if (const llvm::opt::Arg *a = + args.getLastArg(clang::driver::options::OPT_flto_EQ)) { llvm::StringRef s = a->getValue(); assert((s == "full" || s == "thin") && "Unknown LTO mode."); if (s == "full") @@ -1508,10 +1546,10 @@ static bool parseLinkerOptionsArgs(CompilerInvocation &invoc, // -ffat-lto-objects if (const llvm::opt::Arg *arg = - args.getLastArg(clang::options::OPT_ffat_lto_objects, - clang::options::OPT_fno_fat_lto_objects)) { + args.getLastArg(clang::driver::options::OPT_ffat_lto_objects, + clang::driver::options::OPT_fno_fat_lto_objects)) { opts.PrepareForFatLTO = - arg->getOption().matches(clang::options::OPT_ffat_lto_objects); + arg->getOption().matches(clang::driver::options::OPT_ffat_lto_objects); if (opts.PrepareForFatLTO) { assert((opts.PrepareForFullLTO || opts.PrepareForThinLTO) && "Unknown LTO mode"); @@ -1552,8 +1590,8 @@ bool CompilerInvocation::createFromArgs( llvm::Triple::normalize(llvm::sys::getDefaultTargetTriple()); // Parse the arguments - const llvm::opt::OptTable &opts = clang::getDriverOptTable(); - llvm::opt::Visibility visibilityMask(clang::options::FC1Option); + const llvm::opt::OptTable &opts = clang::driver::getDriverOptTable(); + llvm::opt::Visibility visibilityMask(clang::driver::options::FC1Option); unsigned missingArgIndex, missingArgCount; llvm::opt::InputArgList args = opts.ParseArgs( commandLineArgs, missingArgIndex, missingArgCount, visibilityMask); @@ -1566,7 +1604,7 @@ bool CompilerInvocation::createFromArgs( } // Issue errors on unknown arguments - for (const auto *a : args.filtered(clang::options::OPT_UNKNOWN)) { + for (const auto *a : args.filtered(clang::driver::options::OPT_UNKNOWN)) { auto argString = a->getAsString(args); std::string nearest; if (opts.findNearest(argString, nearest, visibilityMask) > 1) @@ -1578,15 +1616,15 @@ bool CompilerInvocation::createFromArgs( } // -flang-experimental-hlfir - if (args.hasArg(clang::options::OPT_flang_experimental_hlfir) || - args.hasArg(clang::options::OPT_emit_hlfir)) { + if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir) || + args.hasArg(clang::driver::options::OPT_emit_hlfir)) { invoc.loweringOpts.setLowerToHighLevelFIR(true); } // -flang-deprecated-no-hlfir - if (args.hasArg(clang::options::OPT_flang_deprecated_no_hlfir) && - !args.hasArg(clang::options::OPT_emit_hlfir)) { - if (args.hasArg(clang::options::OPT_flang_experimental_hlfir)) { + if (args.hasArg(clang::driver::options::OPT_flang_deprecated_no_hlfir) && + !args.hasArg(clang::driver::options::OPT_emit_hlfir)) { + if (args.hasArg(clang::driver::options::OPT_flang_experimental_hlfir)) { const unsigned diagID = diags.getCustomDiagID( clang::DiagnosticsEngine::Error, "Options '-flang-experimental-hlfir' and " @@ -1597,13 +1635,13 @@ bool CompilerInvocation::createFromArgs( } // -fno-ppc-native-vector-element-order - if (args.hasArg(clang::options::OPT_fno_ppc_native_vec_elem_order)) { + if (args.hasArg(clang::driver::options::OPT_fno_ppc_native_vec_elem_order)) { invoc.loweringOpts.setNoPPCNativeVecElemOrder(true); } // -f[no-]init-global-zero - if (args.hasFlag(clang::options::OPT_finit_global_zero, - clang::options::OPT_fno_init_global_zero, + if (args.hasFlag(clang::driver::options::OPT_finit_global_zero, + clang::driver::options::OPT_fno_init_global_zero, /*default=*/true)) invoc.loweringOpts.setInitGlobalZero(true); else @@ -1612,8 +1650,8 @@ bool CompilerInvocation::createFromArgs( // Preserve all the remark options requested, i.e. -Rpass, -Rpass-missed or // -Rpass-analysis. This will be used later when processing and outputting the // remarks generated by LLVM in ExecuteCompilerInvocation.cpp. - for (auto *a : args.filtered(clang::options::OPT_R_Group)) { - if (a->getOption().matches(clang::options::OPT_R_value_Group)) + for (auto *a : args.filtered(clang::driver::options::OPT_R_Group)) { + if (a->getOption().matches(clang::driver::options::OPT_R_value_Group)) // This is -Rfoo=, where foo is the name of the diagnostic // group. Add only the remark option name to the diagnostics. e.g. for // -Rpass= we will add the string "pass". @@ -1626,19 +1664,20 @@ bool CompilerInvocation::createFromArgs( } // -frealloc-lhs is the default. - if (!args.hasFlag(clang::options::OPT_frealloc_lhs, - clang::options::OPT_fno_realloc_lhs, true)) + if (!args.hasFlag(clang::driver::options::OPT_frealloc_lhs, + clang::driver::options::OPT_fno_realloc_lhs, true)) invoc.loweringOpts.setReallocateLHS(false); - invoc.loweringOpts.setRepackArrays(args.hasFlag( - clang::options::OPT_frepack_arrays, clang::options::OPT_fno_repack_arrays, - /*default=*/false)); + invoc.loweringOpts.setRepackArrays( + args.hasFlag(clang::driver::options::OPT_frepack_arrays, + clang::driver::options::OPT_fno_repack_arrays, + /*default=*/false)); invoc.loweringOpts.setStackRepackArrays( - args.hasFlag(clang::options::OPT_fstack_repack_arrays, - clang::options::OPT_fno_stack_repack_arrays, + args.hasFlag(clang::driver::options::OPT_fstack_repack_arrays, + clang::driver::options::OPT_fno_stack_repack_arrays, /*default=*/false)); - if (auto *arg = - args.getLastArg(clang::options::OPT_frepack_arrays_contiguity_EQ)) + if (auto *arg = args.getLastArg( + clang::driver::options::OPT_frepack_arrays_contiguity_EQ)) invoc.loweringOpts.setRepackArraysWhole(arg->getValue() == llvm::StringRef{"whole"}); @@ -1658,8 +1697,10 @@ bool CompilerInvocation::createFromArgs( // `mlirArgs`. Instead, you can use // * `-mllvm `, or // * `-mmlir `. - invoc.frontendOpts.llvmArgs = args.getAllArgValues(clang::options::OPT_mllvm); - invoc.frontendOpts.mlirArgs = args.getAllArgValues(clang::options::OPT_mmlir); + invoc.frontendOpts.llvmArgs = + args.getAllArgValues(clang::driver::options::OPT_mllvm); + invoc.frontendOpts.mlirArgs = + args.getAllArgValues(clang::driver::options::OPT_mmlir); success &= parseLangOptionsArgs(invoc, args, diags); @@ -1683,7 +1724,7 @@ bool CompilerInvocation::createFromArgs( } // Process the timing-related options. - if (args.hasArg(clang::options::OPT_ftime_report)) + if (args.hasArg(clang::driver::options::OPT_ftime_report)) invoc.enableTimers = true; invoc.setArgv0(argv0); diff --git a/flang/lib/FrontendTool/CMakeLists.txt b/flang/lib/FrontendTool/CMakeLists.txt index b69436c36d438..faf56e9d955a1 100644 --- a/flang/lib/FrontendTool/CMakeLists.txt +++ b/flang/lib/FrontendTool/CMakeLists.txt @@ -18,6 +18,5 @@ add_flang_library(flangFrontendTool CLANG_LIBS clangBasic - clangOptions clangDriver ) diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index 7586be59ba01b..09ac129d3e689 100644 --- a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -23,7 +23,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/Pass/PassManager.h" #include "clang/Basic/DiagnosticFrontend.h" -#include "clang/Options/Options.h" +#include "clang/Driver/Options.h" #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" #include "llvm/Support/BuryPointer.h" @@ -153,10 +153,10 @@ updateDiagEngineForOptRemarks(clang::DiagnosticsEngine &diagsEng, bool executeCompilerInvocation(CompilerInstance *flang) { // Honor -help. if (flang->getFrontendOpts().showHelp) { - clang::getDriverOptTable().printHelp( + clang::driver::getDriverOptTable().printHelp( llvm::outs(), "flang -fc1 [options] file...", "LLVM 'Flang' Compiler", /*ShowHidden=*/false, /*ShowAllAliases=*/false, - llvm::opt::Visibility(clang::options::FC1Option)); + llvm::opt::Visibility(clang::driver::options::FC1Option)); return true; } diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt index 3caec17b3edbb..801fc324e888d 100644 --- a/flang/tools/flang-driver/CMakeLists.txt +++ b/flang/tools/flang-driver/CMakeLists.txt @@ -26,7 +26,6 @@ target_link_libraries(flang clang_target_link_libraries(flang PRIVATE clangDriver - clangOptions clangBasic ) diff --git a/flang/tools/flang-driver/driver.cpp b/flang/tools/flang-driver/driver.cpp index 0840255a739f3..bd878b7a642f1 100644 --- a/flang/tools/flang-driver/driver.cpp +++ b/flang/tools/flang-driver/driver.cpp @@ -52,9 +52,9 @@ createAndPopulateDiagOpts(llvm::ArrayRef argv) { // Any errors that would be diagnosed here will also be diagnosed later, // when the DiagnosticsEngine actually exists. unsigned missingArgIndex, missingArgCount; - llvm::opt::InputArgList args = clang::getDriverOptTable().ParseArgs( + llvm::opt::InputArgList args = clang::driver::getDriverOptTable().ParseArgs( argv.slice(1), missingArgIndex, missingArgCount, - llvm::opt::Visibility(clang::options::FlangOption)); + llvm::opt::Visibility(clang::driver::options::FlangOption)); (void)Fortran::frontend::parseDiagnosticArgs(*diagOpts, args); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp index bfbd85ea34203..8b4a3e0a7c3fb 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp @@ -56,7 +56,7 @@ using namespace lldb; using namespace lldb_private; #define OPTTABLE_STR_TABLE_CODE -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef OPTTABLE_STR_TABLE_CODE static Status ExceptionMaskValidator(const char *string, void *unused) { @@ -1124,7 +1124,7 @@ void PlatformDarwin::AddClangModuleCompilationOptionsForSDKType( #define OPTION(PREFIX_OFFSET, NAME_OFFSET, VAR, ...) \ llvm::StringRef opt_##VAR = OptionStrTable[NAME_OFFSET]; \ (void)opt_##VAR; -#include "clang/Options/Options.inc" +#include "clang/Driver/Options.inc" #undef OPTION minimum_version_option << '-'; switch (sdk_type) { diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 1716492e73ff9..3b3c26fee02ec 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -406,7 +406,7 @@ class LLVM_ABI MCAsmInfo { // Generated object files can use all ELF features supported by GNU ld of // this binutils version and later. INT_MAX means all features can be used, // regardless of GNU ld support. The default value is referenced by - // clang/Options/Options.td. + // clang/Driver/Options.td. std::pair BinutilsVersion = {2, 26}; /// Should we use the integrated assembler? diff --git a/llvm/include/llvm/Option/Arg.h b/llvm/include/llvm/Option/Arg.h index 496373d28600f..b1e56b58da684 100644 --- a/llvm/include/llvm/Option/Arg.h +++ b/llvm/include/llvm/Option/Arg.h @@ -51,7 +51,7 @@ class Arg { /// Was this argument used to affect compilation? /// /// This is used to generate an "argument unused" warning (without - /// clang::options::TargetSpecific) or "unsupported option" error + /// clang::driver::options::TargetSpecific) or "unsupported option" error /// (with TargetSpecific). mutable unsigned Claimed : 1; diff --git a/revert_patches.txt b/revert_patches.txt index b88b846f64b68..6be344f862546 100644 --- a/revert_patches.txt +++ b/revert_patches.txt @@ -8,3 +8,6 @@ breaks build of ROCmValidationSuite breaks fortran declare-target-link1 [OMPIRBuilder] Fix addrspace of internal critical section lock (#166459 --- +complicated to land parallel-EQ +Reland "[clang] Refactor option-related code from clangDriver into new clangOptions library" (#167374) +--- diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 36b71dd49ef13..4d279bffd3723 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1501,9 +1501,9 @@ cc_library( gentbl_cc_library( name = "driver_options_inc_gen", - tbl_outs = {"include/clang/Options/Options.inc": ["-gen-opt-parser-defs"]}, + tbl_outs = {"include/clang/Driver/Options.inc": ["-gen-opt-parser-defs"]}, tblgen = "//llvm:llvm-tblgen", - td_file = "include/clang/Options/Options.td", + td_file = "include/clang/Driver/Options.td", deps = ["//llvm:OptParserTdFiles"], )